From 31046bfa1ae7b14115d30e02b5d7531bb19d7a3c Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Tue, 7 Sep 2021 20:57:02 -0500
Subject: [PATCH 01/18] DBG Use scikit 0.24

---
 ci/gpu/build.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index e646307cf6..737acdd038 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -199,6 +199,10 @@ else
     pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps
     set +x
 
+    # https://docs.rapids.ai/maintainers/depmgmt/
+    gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env
+    gpuci_conda_retry install -y "scikit-learn=0.24"
+
     gpuci_logger "Building cuml"
     "$WORKSPACE/build.sh" -v cuml --codecov
 

From 2aeec3deb0535d1a37f8569703f82aeaf8fbf112 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Wed, 8 Sep 2021 09:19:45 -0500
Subject: [PATCH 02/18] FIX Import of trustworthiness for scikit-learn 0.24

---
 python/cuml/test/test_pickle.py          | 2 +-
 python/cuml/test/test_trustworthiness.py | 2 +-
 python/cuml/test/test_tsne.py            | 2 +-
 python/cuml/test/test_umap.py            | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/cuml/test/test_pickle.py b/python/cuml/test/test_pickle.py
index adf912f6f1..d961642a60 100644
--- a/python/cuml/test/test_pickle.py
+++ b/python/cuml/test/test_pickle.py
@@ -24,7 +24,7 @@
 from cuml.test.test_svm import compare_svm, compare_probabilistic_svm
 from sklearn.base import clone
 from sklearn.datasets import load_iris, make_classification, make_regression
-from sklearn.manifold.t_sne import trustworthiness
+from sklearn.manifold import trustworthiness
 from sklearn.model_selection import train_test_split
 
 
diff --git a/python/cuml/test/test_trustworthiness.py b/python/cuml/test/test_trustworthiness.py
index 93819c1a4d..7a2a117067 100644
--- a/python/cuml/test/test_trustworthiness.py
+++ b/python/cuml/test/test_trustworthiness.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import pytest
-from sklearn.manifold.t_sne import trustworthiness as sklearn_trustworthiness
+from sklearn.manifold import trustworthiness as sklearn_trustworthiness
 from cuml.metrics import trustworthiness as cuml_trustworthiness
 
 from sklearn.datasets import make_blobs
diff --git a/python/cuml/test/test_tsne.py b/python/cuml/test/test_tsne.py
index fcff0a49c5..8158543666 100644
--- a/python/cuml/test/test_tsne.py
+++ b/python/cuml/test/test_tsne.py
@@ -23,7 +23,7 @@
 from cuml.neighbors import NearestNeighbors as cuKNN
 
 from sklearn.datasets import make_blobs
-from sklearn.manifold.t_sne import trustworthiness
+from sklearn.manifold import trustworthiness
 from sklearn import datasets
 
 
diff --git a/python/cuml/test/test_umap.py b/python/cuml/test/test_umap.py
index 5f8c54fc05..28b84e9375 100644
--- a/python/cuml/test/test_umap.py
+++ b/python/cuml/test/test_umap.py
@@ -37,7 +37,7 @@
 from sklearn import datasets
 from sklearn.cluster import KMeans
 from sklearn.datasets import make_blobs
-from sklearn.manifold.t_sne import trustworthiness
+from sklearn.manifold import trustworthiness
 from sklearn.metrics import adjusted_rand_score
 
 dataset_names = ['iris', 'digits', 'wine', 'blobs']

From 29032fa0ff631372a6a116aa78f735f9d2f52845 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Thu, 9 Sep 2021 13:15:10 -0500
Subject: [PATCH 03/18] FIX Remove deprecate parameter

---
 python/cuml/test/test_metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cuml/test/test_metrics.py b/python/cuml/test/test_metrics.py
index 8d642b72fd..bb18ecddd4 100644
--- a/python/cuml/test/test_metrics.py
+++ b/python/cuml/test/test_metrics.py
@@ -152,7 +152,7 @@ def test_sklearn_search():
                        normalize=normalize, solver="eig")
 
     assert getattr(cu_clf, 'score', False)
-    sk_cu_grid = GridSearchCV(cu_clf, params, cv=5, iid=False)
+    sk_cu_grid = GridSearchCV(cu_clf, params, cv=5)
 
     gdf_data = cudf.DataFrame(X_train)
     gdf_train = cudf.DataFrame(dict(train=y_train))

From cc8a7de00ccc5c16b71a09e552c9ebf861266662 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Thu, 9 Sep 2021 13:27:31 -0500
Subject: [PATCH 04/18] FIX Remove cupy sparse deprecated imports

---
 python/cuml/_thirdparty/sklearn/utils/validation.py | 2 +-
 python/cuml/common/sparsefuncs.py                   | 2 +-
 python/cuml/naive_bayes/naive_bayes.py              | 2 +-
 python/cuml/preprocessing/encoders.py               | 2 +-
 python/cuml/test/dask/test_tfidf.py                 | 2 +-
 python/cuml/test/test_preproc_utils.py              | 8 ++++----
 python/cuml/thirdparty_adapters/adapters.py         | 6 +++---
 7 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/python/cuml/_thirdparty/sklearn/utils/validation.py b/python/cuml/_thirdparty/sklearn/utils/validation.py
index 62581b7cdc..0db8acc154 100644
--- a/python/cuml/_thirdparty/sklearn/utils/validation.py
+++ b/python/cuml/_thirdparty/sklearn/utils/validation.py
@@ -18,7 +18,7 @@
 import numbers
 import numpy as np
 import cupy as cp
-import cupy.sparse as sp
+import cupyx.scipy.sparse as sp
 from inspect import isclass
 
 from ....common.exceptions import NotFittedError
diff --git a/python/cuml/common/sparsefuncs.py b/python/cuml/common/sparsefuncs.py
index cbed90566d..c870be97fa 100644
--- a/python/cuml/common/sparsefuncs.py
+++ b/python/cuml/common/sparsefuncs.py
@@ -22,7 +22,7 @@
 from cuml.common.import_utils import has_scipy
 import cuml.internals
 from cuml.common.kernel_utils import cuda_kernel_factory
-from cupy.sparse import csr_matrix as cp_csr_matrix,\
+from cupyx.scipy.sparse import csr_matrix as cp_csr_matrix,\
     coo_matrix as cp_coo_matrix, csc_matrix as cp_csc_matrix
 
 
diff --git a/python/cuml/naive_bayes/naive_bayes.py b/python/cuml/naive_bayes/naive_bayes.py
index 4118d41f04..73ef26fbe2 100644
--- a/python/cuml/naive_bayes/naive_bayes.py
+++ b/python/cuml/naive_bayes/naive_bayes.py
@@ -821,7 +821,7 @@ def _count(self, X, Y, classes):
         Sum feature counts & class prior counts and add to current model.
         Parameters
         ----------
-        X : cupy.ndarray or cupy.sparse matrix of size
+        X : cupy.ndarray or cupyx.scipy.sparse matrix of size
                   (n_rows, n_features)
         Y : cupy.array of monotonic class labels
         """
diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py
index bb5c2b87cb..77676080ef 100644
--- a/python/cuml/preprocessing/encoders.py
+++ b/python/cuml/preprocessing/encoders.py
@@ -107,7 +107,7 @@ def __init__(self, *,
                  categories='auto',
                  drop=None,
                  sparse=True,
-                 dtype=np.float,
+                 dtype=np.float32,
                  handle_unknown='error',
                  handle=None,
                  verbose=False,
diff --git a/python/cuml/test/dask/test_tfidf.py b/python/cuml/test/dask/test_tfidf.py
index 9461f07e2d..ae79c7426a 100644
--- a/python/cuml/test/dask/test_tfidf.py
+++ b/python/cuml/test/dask/test_tfidf.py
@@ -17,7 +17,7 @@
 import numpy as np
 import cupy as cp
 from scipy.sparse import csr_matrix as scipy_csr_matrix
-from cupy.sparse import csr_matrix as cp_csr_matrix
+from cupyx.scipy.sparse import csr_matrix as cp_csr_matrix
 import dask.array as da
 import dask
 
diff --git a/python/cuml/test/test_preproc_utils.py b/python/cuml/test/test_preproc_utils.py
index 6728bcd1d4..52caa74517 100644
--- a/python/cuml/test/test_preproc_utils.py
+++ b/python/cuml/test/test_preproc_utils.py
@@ -20,11 +20,11 @@
 
 import numpy as np
 import cupy as cp
-import cupy.sparse as gpu_sparse
+import cupyx.scipy.sparse as gpu_sparse
 import scipy.sparse as cpu_sparse
-from cupy.sparse import csr_matrix as gpu_csr_matrix
-from cupy.sparse import csc_matrix as gpu_csc_matrix
-from cupy.sparse import coo_matrix as gpu_coo_matrix
+from cupyx.scipy.sparse import csr_matrix as gpu_csr_matrix
+from cupyx.scipy.sparse import csc_matrix as gpu_csc_matrix
+from cupyx.scipy.sparse import coo_matrix as gpu_coo_matrix
 from scipy.sparse import csr_matrix as cpu_csr_matrix
 from scipy.sparse import csc_matrix as cpu_csc_matrix
 from scipy.sparse import coo_matrix as cpu_coo_matrix
diff --git a/python/cuml/thirdparty_adapters/adapters.py b/python/cuml/thirdparty_adapters/adapters.py
index 6b67dd7b99..9c369fbfed 100644
--- a/python/cuml/thirdparty_adapters/adapters.py
+++ b/python/cuml/thirdparty_adapters/adapters.py
@@ -16,9 +16,9 @@
 import numpy as np
 import cupy as cp
 from cuml.common.input_utils import input_to_cupy_array
-from cupy.sparse import csr_matrix as gpu_csr_matrix
-from cupy.sparse import csc_matrix as gpu_csc_matrix
-from cupy.sparse import csc_matrix as gpu_coo_matrix
+from cupyx.scipy.sparse import csr_matrix as gpu_csr_matrix
+from cupyx.scipy.sparse import csc_matrix as gpu_csc_matrix
+from cupyx.scipy.sparse import csc_matrix as gpu_coo_matrix
 from scipy import sparse as cpu_sparse
 from cupy import sparse as gpu_sparse
 

From b6065de63e16fa0df2bed8ff582c8ea937462ae9 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Thu, 9 Sep 2021 13:35:01 -0500
Subject: [PATCH 05/18] FIX Remove cupy sparse deprecated imports

---
 python/cuml/common/sparsefuncs.py             |  2 +-
 python/cuml/feature_extraction/_tfidf.py      |  2 +-
 python/cuml/naive_bayes/naive_bayes.py        | 14 ++--
 python/cuml/test/test_adapters.py             |  4 +-
 python/cuml/test/test_nearest_neighbors.py    |  4 +-
 python/cuml/test/test_preprocessing.py        | 68 +++++++++----------
 python/cuml/test/test_thirdparty.py           |  8 +--
 .../thirdparty_adapters/sparsefuncs_fast.py   |  2 +-
 8 files changed, 52 insertions(+), 52 deletions(-)

diff --git a/python/cuml/common/sparsefuncs.py b/python/cuml/common/sparsefuncs.py
index c870be97fa..b6fe00dac2 100644
--- a/python/cuml/common/sparsefuncs.py
+++ b/python/cuml/common/sparsefuncs.py
@@ -191,7 +191,7 @@ def extract_knn_graph(knn_graph, convert_dtype=True, sparse=False):
         csc_matrix = DummyClass
 
     if isinstance(knn_graph, (csc_matrix, cp_csc_matrix)):
-        knn_graph = cp.sparse.csr_matrix(knn_graph)
+        knn_graph = cupyx.scipy.sparse.csr_matrix(knn_graph)
         n_samples = knn_graph.shape[0]
         reordering = knn_graph.data.reshape((n_samples, -1))
         reordering = reordering.argsort()
diff --git a/python/cuml/feature_extraction/_tfidf.py b/python/cuml/feature_extraction/_tfidf.py
index d217e09c95..7b213bfc91 100644
--- a/python/cuml/feature_extraction/_tfidf.py
+++ b/python/cuml/feature_extraction/_tfidf.py
@@ -161,7 +161,7 @@ def _set_idf_diag(self):
         # log+1 instead of log makes sure terms with zero idf don't get
         # suppressed entirely.
         idf = cp.log(n_samples / df) + 1
-        self._idf_diag = cp.sparse.dia_matrix(
+        self._idf_diag = cupyx.scipy.sparse.dia_matrix(
             (idf, 0),
             shape=(self.__n_features, self.__n_features),
             dtype=df.dtype
diff --git a/python/cuml/naive_bayes/naive_bayes.py b/python/cuml/naive_bayes/naive_bayes.py
index 73ef26fbe2..d74d6a6642 100644
--- a/python/cuml/naive_bayes/naive_bayes.py
+++ b/python/cuml/naive_bayes/naive_bayes.py
@@ -356,7 +356,7 @@ def _partial_fit(self, X, y, _classes=None, _refit=False,
             raise ValueError("classes must be passed on the first call "
                              "to partial_fit.")
 
-        if scipy_sparse_isspmatrix(X) or cp.sparse.isspmatrix(X):
+        if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
             X = _convert_x_sparse(X)
         else:
             X = input_to_cupy_array(X, order='K',
@@ -506,7 +506,7 @@ def _update_mean_variance(self, X, Y, sample_weight=None):
         new_var = cp.zeros((self.n_classes_, self.n_features_), order="F",
                            dtype=X.dtype)
         class_counts = cp.zeros(self.n_classes_, order="F", dtype=X.dtype)
-        if cp.sparse.isspmatrix(X):
+        if cupyx.scipy.sparse.isspmatrix(X):
             X = X.tocoo()
 
             count_features_coo = count_features_coo_kernel(X.dtype,
@@ -732,7 +732,7 @@ def _partial_fit(self, X, y, sample_weight=None,
                 as scipy_sparse_isspmatrix
 
         # TODO: use SparseCumlArray
-        if scipy_sparse_isspmatrix(X) or cp.sparse.isspmatrix(X):
+        if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
             X = _convert_x_sparse(X)
         else:
             X = input_to_cupy_array(X, order='K',
@@ -771,7 +771,7 @@ def _partial_fit(self, X, y, sample_weight=None,
         else:
             check_labels(Y, self.classes_)
 
-        if cp.sparse.isspmatrix(X):
+        if cupyx.scipy.sparse.isspmatrix(X):
             # X is assumed to be a COO here
             self._count_sparse(X.row, X.col, X.data, X.shape, Y, self.classes_)
         else:
@@ -1007,7 +1007,7 @@ class MultinomialNB(_BaseDiscreteNB):
 
         # Put feature vectors and labels on the GPU
 
-        X = cp.sparse.csr_matrix(features.tocsr(), dtype=cp.float32)
+        X = cupyx.scipy.sparse.csr_matrix(features.tocsr(), dtype=cp.float32)
         y = cp.asarray(twenty_train.target, dtype=cp.int32)
 
         # Train model
@@ -1170,7 +1170,7 @@ def __init__(self, *, alpha=1.0, binarize=.0, fit_prior=True,
     def _check_X(self, X):
         X = super()._check_X(X)
         if self.binarize is not None:
-            if cp.sparse.isspmatrix(X):
+            if cupyx.scipy.sparse.isspmatrix(X):
                 X.data = binarize(X.data, threshold=self.binarize)
             else:
                 X = binarize(X, threshold=self.binarize)
@@ -1179,7 +1179,7 @@ def _check_X(self, X):
     def _check_X_y(self, X, y):
         X, y = super()._check_X_y(X, y)
         if self.binarize is not None:
-            if cp.sparse.isspmatrix(X):
+            if cupyx.scipy.sparse.isspmatrix(X):
                 X.data = binarize(X.data, threshold=self.binarize)
             else:
                 X = binarize(X, threshold=self.binarize)
diff --git a/python/cuml/test/test_adapters.py b/python/cuml/test/test_adapters.py
index 70435a58fe..0a059e8c6e 100644
--- a/python/cuml/test/test_adapters.py
+++ b/python/cuml/test/test_adapters.py
@@ -68,9 +68,9 @@ def sparse_random_dataset(request, random_seed):
                                   replace=False)
     X.ravel()[random_loc] = 0
     if request.param == 'cupy-csr':
-        X_sparse = cp.sparse.csr_matrix(X)
+        X_sparse = cupyx.scipy.sparse.csr_matrix(X)
     elif request.param == 'cupy-csc':
-        X_sparse = cp.sparse.csc_matrix(X)
+        X_sparse = cupyx.scipy.sparse.csc_matrix(X)
     return X.get(), X, X_sparse.get(), X_sparse
 
 
diff --git a/python/cuml/test/test_nearest_neighbors.py b/python/cuml/test/test_nearest_neighbors.py
index 66c7fb8f53..24a9f35a35 100644
--- a/python/cuml/test/test_nearest_neighbors.py
+++ b/python/cuml/test/test_nearest_neighbors.py
@@ -524,9 +524,9 @@ def test_nearest_neighbors_sparse(metric,
     if nrows == 1 and n_neighbors > 1:
         return
 
-    a = cp.sparse.random(nrows, ncols, format='csr', density=density,
+    a = cupyx.scipy.sparse.random(nrows, ncols, format='csr', density=density,
                          random_state=35)
-    b = cp.sparse.random(nrows, ncols, format='csr', density=density,
+    b = cupyx.scipy.sparse.random(nrows, ncols, format='csr', density=density,
                          random_state=38)
 
     if metric == 'jaccard':
diff --git a/python/cuml/test/test_preprocessing.py b/python/cuml/test/test_preprocessing.py
index dc5416b14b..59021e8916 100644
--- a/python/cuml/test/test_preprocessing.py
+++ b/python/cuml/test/test_preprocessing.py
@@ -139,12 +139,12 @@ def test_standard_scaler_sparse(failure_logger,
     r_X = scaler.inverse_transform(t_X)
     #  assert type(t_X) == type(X)
     #  assert type(r_X) == type(t_X)
-    if cp.sparse.issparse(X):
-        assert cp.sparse.issparse(t_X)
+    if cupyx.scipy.sparse.issparse(X):
+        assert cupyx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
-    if cp.sparse.issparse(t_X):
-        assert cp.sparse.issparse(r_X)
+    if cupyx.scipy.sparse.issparse(t_X):
+        assert cupyx.scipy.sparse.issparse(r_X)
     if scipy.sparse.issparse(t_X):
         assert scipy.sparse.issparse(r_X)
 
@@ -180,8 +180,8 @@ def test_scale_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
 
     t_X = cu_scale(X, with_mean=False, with_std=with_std, copy=True)
     #  assert type(t_X) == type(X)
-    if cp.sparse.issparse(X):
-        assert cp.sparse.issparse(t_X)
+    if cupyx.scipy.sparse.issparse(X):
+        assert cupyx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
 
@@ -230,12 +230,12 @@ def test_maxabs_scaler_sparse(failure_logger,
     r_X = scaler.inverse_transform(t_X)
     #  assert type(t_X) == type(X)
     #  assert type(r_X) == type(t_X)
-    if cp.sparse.issparse(X):
-        assert cp.sparse.issparse(t_X)
+    if cupyx.scipy.sparse.issparse(X):
+        assert cupyx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
-    if cp.sparse.issparse(t_X):
-        assert cp.sparse.issparse(r_X)
+    if cupyx.scipy.sparse.issparse(t_X):
+        assert cupyx.scipy.sparse.issparse(r_X)
     if scipy.sparse.issparse(t_X):
         assert scipy.sparse.issparse(r_X)
 
@@ -272,8 +272,8 @@ def test_normalizer_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
     normalizer = cuNormalizer(norm=norm, copy=True)
     t_X = normalizer.fit_transform(X)
     #  assert type(t_X) == type(X)
-    if cp.sparse.issparse(X):
-        assert cp.sparse.issparse(t_X)
+    if cupyx.scipy.sparse.issparse(X):
+        assert cupyx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
 
@@ -314,8 +314,8 @@ def test_normalize_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
 
     t_X = cu_normalize(X, axis=axis, norm=norm)
     #  assert type(t_X) == type(X)
-    if cp.sparse.issparse(X):
-        assert cp.sparse.issparse(t_X)
+    if cupyx.scipy.sparse.issparse(X):
+        assert cupyx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
 
@@ -369,8 +369,8 @@ def test_imputer_sparse(sparse_imputer_dataset,  # noqa: F811
                               strategy=strategy, fill_value=fill_value)
     t_X = imputer.fit_transform(X)
     #  assert type(t_X) == type(X)
-    if cp.sparse.issparse(X):
-        assert cp.sparse.issparse(t_X)
+    if cupyx.scipy.sparse.issparse(X):
+        assert cupyx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
 
@@ -423,8 +423,8 @@ def test_poly_features_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
                                         include_bias=include_bias)
     t_X = polyfeatures.fit_transform(X)
     #  assert type(t_X) == type(X)
-    if cp.sparse.issparse(X):
-        assert cp.sparse.issparse(t_X)
+    if cupyx.scipy.sparse.issparse(X):
+        assert cupyx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
 
@@ -455,8 +455,8 @@ def test_add_dummy_feature_sparse(failure_logger,
 
     t_X = cu_add_dummy_feature(X, value=value)
     #  assert type(t_X) == type(X)
-    if cp.sparse.issparse(X):
-        assert cp.sparse.issparse(t_X)
+    if cupyx.scipy.sparse.issparse(X):
+        assert cupyx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
 
@@ -483,8 +483,8 @@ def test_binarize_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
 
     t_X = cu_binarize(X, threshold=threshold, copy=True)
     #  assert type(t_X) == type(X)
-    if cp.sparse.issparse(X):
-        assert cp.sparse.issparse(t_X)
+    if cupyx.scipy.sparse.issparse(X):
+        assert cupyx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
 
@@ -515,8 +515,8 @@ def test_binarizer_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
     binarizer = cuBinarizer(threshold=threshold, copy=True)
     t_X = binarizer.fit_transform(X)
     #  assert type(t_X) == type(X)
-    if cp.sparse.issparse(X):
-        assert cp.sparse.issparse(t_X)
+    if cupyx.scipy.sparse.issparse(X):
+        assert cupyx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
 
@@ -570,12 +570,12 @@ def test_robust_scaler_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
     r_X = scaler.inverse_transform(t_X)
     #  assert type(t_X) == type(X)
     #  assert type(r_X) == type(t_X)
-    if cp.sparse.issparse(X):
-        assert cp.sparse.issparse(t_X)
+    if cupyx.scipy.sparse.issparse(X):
+        assert cupyx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
-    if cp.sparse.issparse(t_X):
-        assert cp.sparse.issparse(r_X)
+    if cupyx.scipy.sparse.issparse(t_X):
+        assert cupyx.scipy.sparse.issparse(r_X)
     if scipy.sparse.issparse(t_X):
         assert scipy.sparse.issparse(r_X)
 
@@ -632,8 +632,8 @@ def test_robust_scale_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
                           quantile_range=quantile_range,
                           copy=True)
     #  assert type(t_X) == type(X)
-    if cp.sparse.issparse(X):
-        assert cp.sparse.issparse(t_X)
+    if cupyx.scipy.sparse.issparse(X):
+        assert cupyx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
 
@@ -728,11 +728,11 @@ def test_missing_indicator_sparse(failure_logger,
                                    missing_values=1)
     ft_X = indicator.fit_transform(X)
     # assert type(ft_X) == type(X)
-    assert cp.sparse.issparse(ft_X) or scipy.sparse.issparse(ft_X)
+    assert cupyx.scipy.sparse.issparse(ft_X) or scipy.sparse.issparse(ft_X)
     indicator.fit(X)
     t_X = indicator.transform(X)
     # assert type(t_X) == type(X)
-    assert cp.sparse.issparse(t_X) or scipy.sparse.issparse(t_X)
+    assert cupyx.scipy.sparse.issparse(t_X) or scipy.sparse.issparse(t_X)
 
     indicator = skMissingIndicator(features=features,
                                    missing_values=1)
@@ -769,8 +769,8 @@ def test_function_transformer_sparse(sparse_clf_dataset):  # noqa: F811
                                         accept_sparse=True)
     t_X = transformer.fit_transform(X)
     r_X = transformer.inverse_transform(t_X)
-    assert cp.sparse.issparse(t_X) or scipy.sparse.issparse(t_X)
-    assert cp.sparse.issparse(r_X) or scipy.sparse.issparse(r_X)
+    assert cupyx.scipy.sparse.issparse(t_X) or scipy.sparse.issparse(t_X)
+    assert cupyx.scipy.sparse.issparse(r_X) or scipy.sparse.issparse(r_X)
 
     transformer = skFunctionTransformer(func=lambda x: x * 2,
                                         inverse_func=lambda x: x / 2,
diff --git a/python/cuml/test/test_thirdparty.py b/python/cuml/test/test_thirdparty.py
index 4ab2a135a1..24eeafa958 100644
--- a/python/cuml/test/test_thirdparty.py
+++ b/python/cuml/test/test_thirdparty.py
@@ -55,9 +55,9 @@ def sparse_random_dataset(request, random_seed):
     random_loc = cp.random.choice(X.size, int(X.size * 0.3), replace=False)
     X.ravel()[random_loc] = 0
     if request.param == 'cupy-csr':
-        X_sparse = cp.sparse.csr_matrix(X)
+        X_sparse = cupyx.scipy.sparse.csr_matrix(X)
     elif request.param == 'cupy-csc':
-        X_sparse = cp.sparse.csc_matrix(X)
+        X_sparse = cupyx.scipy.sparse.csc_matrix(X)
     return X.get(), X, X_sparse.get(), X_sparse
 
 
@@ -191,9 +191,9 @@ def sparse_extremes(request, random_seed):
        [0.0, 0.0, cp.nan],
        [0.0, cp.nan, cp.nan]])
     if request.param == 'cupy-csr':
-        X_sparse = cp.sparse.csr_matrix(X)
+        X_sparse = cupyx.scipy.sparse.csr_matrix(X)
     elif request.param == 'cupy-csc':
-        X_sparse = cp.sparse.csc_matrix(X)
+        X_sparse = cupyx.scipy.sparse.csc_matrix(X)
     return X_sparse.get(), X_sparse
 
 
diff --git a/python/cuml/thirdparty_adapters/sparsefuncs_fast.py b/python/cuml/thirdparty_adapters/sparsefuncs_fast.py
index 59bc699a34..01cc091f25 100644
--- a/python/cuml/thirdparty_adapters/sparsefuncs_fast.py
+++ b/python/cuml/thirdparty_adapters/sparsefuncs_fast.py
@@ -356,6 +356,6 @@ def csr_polynomial_expansion(X, interaction_only, degree):
                                 d, interaction_only, degree,
                                 expanded_indptr)
 
-    return cp.sparse.csr_matrix((expanded_data, expanded_indices,
+    return cupyx.scipy.sparse.csr_matrix((expanded_data, expanded_indices,
                                  expanded_indptr),
                                 shape=(num_rows, expanded_dimensionality))

From 8d3ae9911460c8869b393629b9deac49d65fd0a0 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Thu, 9 Sep 2021 15:25:52 -0500
Subject: [PATCH 06/18] FIX Multiple pytest warning fixes

---
 .../sklearn/preprocessing/_column_transformer.py         | 2 +-
 python/cuml/_thirdparty/sklearn/preprocessing/_data.py   | 2 +-
 .../_thirdparty/sklearn/preprocessing/_imputation.py     | 2 +-
 python/cuml/_thirdparty/sklearn/utils/extmath.py         | 2 +-
 python/cuml/_thirdparty/sklearn/utils/sparsefuncs.py     | 2 +-
 python/cuml/cluster/agglomerative.pyx                    | 6 +++---
 python/cuml/metrics/trustworthiness.pyx                  | 9 ++-------
 python/cuml/test/test_adapters.py                        | 5 +++--
 python/cuml/test/test_api.py                             | 2 ++
 python/cuml/test/test_base.py                            | 2 ++
 python/cuml/thirdparty_adapters/adapters.py              | 2 +-
 python/pytest.ini                                        | 1 +
 12 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/_column_transformer.py b/python/cuml/_thirdparty/sklearn/preprocessing/_column_transformer.py
index be29025ebd..01c1912e5d 100644
--- a/python/cuml/_thirdparty/sklearn/preprocessing/_column_transformer.py
+++ b/python/cuml/_thirdparty/sklearn/preprocessing/_column_transformer.py
@@ -22,7 +22,7 @@
 import warnings
 
 from scipy import sparse as sp_sparse
-from cupy import sparse as cu_sparse
+from cupyx.scipy import sparse as cu_sparse
 import numpy as cpu_np
 import cupy as np
 import numba
diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/_data.py b/python/cuml/_thirdparty/sklearn/preprocessing/_data.py
index 8f2b2fcb7b..ca42e28a69 100644
--- a/python/cuml/_thirdparty/sklearn/preprocessing/_data.py
+++ b/python/cuml/_thirdparty/sklearn/preprocessing/_data.py
@@ -22,7 +22,7 @@
 
 import numpy as cpu_np
 import cupy as np
-from cupy import sparse
+from cupyx.scipy import sparse
 from scipy import stats
 from scipy import optimize
 from scipy.special import boxcox
diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/_imputation.py b/python/cuml/_thirdparty/sklearn/preprocessing/_imputation.py
index 35c571507c..b8ad6d0f7a 100644
--- a/python/cuml/_thirdparty/sklearn/preprocessing/_imputation.py
+++ b/python/cuml/_thirdparty/sklearn/preprocessing/_imputation.py
@@ -16,7 +16,7 @@
 import numpy
 import cupy as np
 import cuml
-from cupy import sparse
+from cupyx.scipy import sparse
 
 from ....thirdparty_adapters import (_get_mask,
                                      _masked_column_median,
diff --git a/python/cuml/_thirdparty/sklearn/utils/extmath.py b/python/cuml/_thirdparty/sklearn/utils/extmath.py
index 7d04ede66a..7203eca6e6 100644
--- a/python/cuml/_thirdparty/sklearn/utils/extmath.py
+++ b/python/cuml/_thirdparty/sklearn/utils/extmath.py
@@ -18,7 +18,7 @@
 
 import cupy as np
 import cupyx
-from cupy import sparse
+from cupyx.scipy import sparse
 
 
 def row_norms(X, squared=False):
diff --git a/python/cuml/_thirdparty/sklearn/utils/sparsefuncs.py b/python/cuml/_thirdparty/sklearn/utils/sparsefuncs.py
index c42687039e..bf7d81aaa3 100644
--- a/python/cuml/_thirdparty/sklearn/utils/sparsefuncs.py
+++ b/python/cuml/_thirdparty/sklearn/utils/sparsefuncs.py
@@ -13,7 +13,7 @@
 
 
 from scipy import sparse as cpu_sp
-from cupy import sparse as gpu_sp
+from cupyx.scipy import sparse as gpu_sp
 import cupy as np
 import numpy as cpu_np
 
diff --git a/python/cuml/cluster/agglomerative.pyx b/python/cuml/cluster/agglomerative.pyx
index df5ce36816..ea07a92650 100644
--- a/python/cuml/cluster/agglomerative.pyx
+++ b/python/cuml/cluster/agglomerative.pyx
@@ -135,9 +135,9 @@ class AgglomerativeClustering(Base, ClusterMixin, CMajorInputTagMixin):
                  handle=None, verbose=False, connectivity='knn',
                  n_neighbors=10, output_type=None):
 
-        super().__init__(handle,
-                         verbose,
-                         output_type)
+        super().__init__(handle=handle,
+                         verbose=verbose,
+                         output_type=output_type)
 
         if linkage is not "single":
             raise ValueError("Only single linkage clustering is "
diff --git a/python/cuml/metrics/trustworthiness.pyx b/python/cuml/metrics/trustworthiness.pyx
index 9bb907f81d..1dee80aced 100644
--- a/python/cuml/metrics/trustworthiness.pyx
+++ b/python/cuml/metrics/trustworthiness.pyx
@@ -54,8 +54,8 @@ def _get_array_ptr(obj):
 
 @cuml.internals.api_return_any()
 def trustworthiness(X, X_embedded, handle=None, n_neighbors=5,
-                    metric='euclidean', should_downcast=True,
-                    convert_dtype=False, batch_size=512) -> double:
+                    metric='euclidean',
+                    convert_dtype=True, batch_size=512) -> double:
     """
     Expresses to what extent the local structure is retained in embedding.
     The score is defined in the range [0, 1].
@@ -83,11 +83,6 @@ def trustworthiness(X, X_embedded, handle=None, n_neighbors=5,
             Trustworthiness of the low-dimensional embedding
     """
 
-    if should_downcast:
-        convert_dtype = True
-        warnings.warn("Parameter should_downcast is deprecated, use "
-                      "convert_dtype instead. ")
-
     handle = cuml.raft.common.handle.Handle() if handle is None else handle
 
     cdef uintptr_t d_X_ptr
diff --git a/python/cuml/test/test_adapters.py b/python/cuml/test/test_adapters.py
index 0a059e8c6e..b531deadd4 100644
--- a/python/cuml/test/test_adapters.py
+++ b/python/cuml/test/test_adapters.py
@@ -17,6 +17,7 @@
 import pytest
 
 import cupy as cp
+import cupyx as cpx
 import numpy as np
 from cupyx.scipy.sparse import coo_matrix
 from scipy import stats
@@ -68,9 +69,9 @@ def sparse_random_dataset(request, random_seed):
                                   replace=False)
     X.ravel()[random_loc] = 0
     if request.param == 'cupy-csr':
-        X_sparse = cupyx.scipy.sparse.csr_matrix(X)
+        X_sparse = cpx.scipy.sparse.csr_matrix(X)
     elif request.param == 'cupy-csc':
-        X_sparse = cupyx.scipy.sparse.csc_matrix(X)
+        X_sparse = cpx.scipy.sparse.csc_matrix(X)
     return X.get(), X, X_sparse.get(), X_sparse
 
 
diff --git a/python/cuml/test/test_api.py b/python/cuml/test/test_api.py
index 6ff27f7bea..c4efab811a 100644
--- a/python/cuml/test/test_api.py
+++ b/python/cuml/test/test_api.py
@@ -220,6 +220,8 @@ def test_mro(model):
 ###############################################################################
 
 @pytest.mark.parametrize("model_name", list(models.keys()))
+# ignore random forest float64 warnings
+@pytest.mark.filterwarnings("ignore:To use pickling or GPU-based")
 def test_fit_function(dataset, model_name):
     # This test ensures that our estimators return self after a call to fit
     if model_name in [
diff --git a/python/cuml/test/test_base.py b/python/cuml/test/test_base.py
index 433cc118f7..721169430e 100644
--- a/python/cuml/test/test_base.py
+++ b/python/cuml/test/test_base.py
@@ -145,6 +145,8 @@ def get_param_doc(param_doc_obj, name: str):
 
 
 @pytest.mark.parametrize('child_class', list(all_base_children.keys()))
+# ignore ColumnTransformer init warning
+@pytest.mark.filterwarnings("ignore:Transformers are required")
 def test_base_children_get_param_names(child_class: str):
 
     """
diff --git a/python/cuml/thirdparty_adapters/adapters.py b/python/cuml/thirdparty_adapters/adapters.py
index 9c369fbfed..8ea55a21c0 100644
--- a/python/cuml/thirdparty_adapters/adapters.py
+++ b/python/cuml/thirdparty_adapters/adapters.py
@@ -20,7 +20,7 @@
 from cupyx.scipy.sparse import csc_matrix as gpu_csc_matrix
 from cupyx.scipy.sparse import csc_matrix as gpu_coo_matrix
 from scipy import sparse as cpu_sparse
-from cupy import sparse as gpu_sparse
+from cupyx.scipy import sparse as gpu_sparse
 
 from pandas import DataFrame as pdDataFrame
 from cudf import DataFrame as cuDataFrame
diff --git a/python/pytest.ini b/python/pytest.ini
index e5d260c8b9..0d918c24d2 100644
--- a/python/pytest.ini
+++ b/python/pytest.ini
@@ -11,3 +11,4 @@ testpaths = cuml/test
 
 filterwarnings =
   error::FutureWarning:cuml[.*] # Catch uses of deprecated positional args in testing
+  ignore:[^.]*ABCs[^.]*:DeprecationWarning:patsy[.*]

From f50ba0b4d3b9d2f64b9573766a3255e04619ab5b Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Fri, 10 Sep 2021 11:44:06 -0500
Subject: [PATCH 07/18] FIX Multiple pytest warning fixes

---
 python/cuml/ensemble/randomforest_common.pyx  |  4 +-
 .../cuml/experimental/linear_model/lars.pyx   |  2 +-
 python/cuml/preprocessing/encoders.py         |  7 +-
 python/cuml/test/test_coordinate_descent.py   |  2 +
 python/cuml/test/test_cuml_descr_decor.py     | 18 ++---
 python/cuml/test/test_hdbscan.py              | 12 ++--
 python/cuml/test/test_holtwinters.py          |  6 +-
 .../cuml/test/test_kneighbors_classifier.py   |  2 +-
 python/cuml/test/test_linear_model.py         | 16 ++++-
 python/cuml/test/test_make_classification.py  |  2 +-
 python/cuml/test/test_mbsgd_classifier.py     |  1 +
 python/cuml/test/test_mbsgd_regressor.py      |  3 +-
 python/cuml/test/test_metrics.py              | 69 +++++++++++--------
 python/cuml/test/test_nearest_neighbors.py    | 11 +--
 python/cuml/test/test_one_hot_encoder.py      |  2 +
 python/cuml/test/test_pickle.py               |  6 +-
 python/pytest.ini                             |  1 +
 17 files changed, 100 insertions(+), 64 deletions(-)

diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index 1bba0d37a1..ee0f1d7f0b 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -41,13 +41,13 @@ from cuml.common.array_descriptor import CumlArrayDescriptor
 class BaseRandomForestModel(Base):
     _param_names = ['n_estimators', 'max_depth', 'handle',
                     'max_features', 'n_bins',
-                    'split_algo', 'split_criterion', 'min_samples_leaf',
+                    'split_criterion', 'min_samples_leaf',
                     'min_samples_split',
                     'min_impurity_decrease',
                     'bootstrap',
                     'verbose', 'max_samples',
                     'max_leaves',
-                    'accuracy_metric', 'use_experimental_backend',
+                    'accuracy_metric',
                     'max_batch_size', 'n_streams', 'dtype',
                     'output_type', 'min_weight_fraction_leaf', 'n_jobs',
                     'max_leaf_nodes', 'min_impurity_split', 'oob_score',
diff --git a/python/cuml/experimental/linear_model/lars.pyx b/python/cuml/experimental/linear_model/lars.pyx
index bf98368421..848350c741 100644
--- a/python/cuml/experimental/linear_model/lars.pyx
+++ b/python/cuml/experimental/linear_model/lars.pyx
@@ -317,7 +317,7 @@ class Lars(Base, RegressorMixin):
             X = cp.copy(X)
 
         if self.eps is None:
-            self.eps = np.finfo(np.float).eps
+            self.eps = np.finfo(float).eps
 
         self._fit_cpp(X, y, Gram, x_scale)
 
diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py
index 77676080ef..af6f32e5cd 100644
--- a/python/cuml/preprocessing/encoders.py
+++ b/python/cuml/preprocessing/encoders.py
@@ -347,7 +347,7 @@ def transform(self, X):
 
                 if self.drop_idx_ is not None:
                     drop_idx = self.drop_idx_[feature] + j
-                    mask = cp.ones(col_idx.shape, dtype=cp.bool)
+                    mask = cp.ones(col_idx.shape, dtype=bool)
                     mask[col_idx == drop_idx] = False
                     col_idx = col_idx[mask]
                     row_idx = row_idx[mask]
@@ -454,8 +454,9 @@ def inverse_transform(self, X):
                 result = cp.asarray(result.as_gpu_matrix())
             except ValueError:
                 warnings.warn("The input one hot encoding contains rows with "
-                              "unknown categories. Arrays do not support null "
-                              "values. Returning output as a DataFrame "
+                              "unknown categories. Since device arrays do not "
+                              "support null values, the output will be "
+                              "returned as a DataFrame "
                               "instead.")
         return result
 
diff --git a/python/cuml/test/test_coordinate_descent.py b/python/cuml/test/test_coordinate_descent.py
index 92149aa260..969aa1b913 100644
--- a/python/cuml/test/test_coordinate_descent.py
+++ b/python/cuml/test/test_coordinate_descent.py
@@ -35,6 +35,7 @@
 @pytest.mark.parametrize('column_info', [unit_param([20, 10]),
                          quality_param([100, 50]),
                          stress_param([1000, 500])])
+@pytest.mark.filterwarnings("ignore:Objective did not converge::sklearn[.*]")
 def test_lasso(datatype, X_type, alpha, algorithm,
                nrows, column_info):
     ncols, n_info = column_info
@@ -103,6 +104,7 @@ def test_lasso_default(datatype, nrows, column_info):
 @pytest.mark.parametrize('column_info', [unit_param([20, 10]),
                          quality_param([100, 50]),
                          stress_param([1000, 500])])
+@pytest.mark.filterwarnings("ignore:Objective did not converge::sklearn[.*]")
 def test_elastic_net(datatype, X_type, alpha, algorithm,
                      nrows, column_info):
     ncols, n_info = column_info
diff --git a/python/cuml/test/test_cuml_descr_decor.py b/python/cuml/test/test_cuml_descr_decor.py
index 55b0a531fe..24ee781bef 100644
--- a/python/cuml/test/test_cuml_descr_decor.py
+++ b/python/cuml/test/test_cuml_descr_decor.py
@@ -41,7 +41,7 @@
 test_shapes = [10, (10, 1), (10, 5), (1, 10)]
 
 
-class TestEstimator(cuml.Base):
+class DummyTestEstimator(cuml.Base):
 
     input_any_ = CumlArrayDescriptor()
 
@@ -57,7 +57,7 @@ def get_input(self):
         return self.input_any_
 
     # === Standard Functions ===
-    def fit(self, X, convert_dtype=True) -> "TestEstimator":
+    def fit(self, X, convert_dtype=True) -> "DummyTestEstimator":
 
         return self
 
@@ -115,7 +115,7 @@ def test_pickle(input_type):
     if (input_type == "numba"):
         pytest.skip("numba arrays cant be picked at this time")
 
-    est = TestEstimator()
+    est = DummyTestEstimator()
 
     X_in = create_input(input_type, np.float32, (10, 5), "C")
 
@@ -128,7 +128,7 @@ def test_pickle(input_type):
                                    create_output(X_in, out_type))
 
     est_pickled_bytes = pickle.dumps(est)
-    est_unpickled: TestEstimator = pickle.loads(est_pickled_bytes)
+    est_unpickled: DummyTestEstimator = pickle.loads(est_pickled_bytes)
 
     # Assert that we only resture the input
     assert est_unpickled.__dict__["input_any_"].input_type == input_type
@@ -163,7 +163,7 @@ def test_dec_input_output(input_type, input_dtype, input_shape, output_type):
     X_out = create_output(X_in, output_type)
 
     # Test with output_type="input"
-    est = TestEstimator(output_type="input")
+    est = DummyTestEstimator(output_type="input")
 
     est.store_input(X_in)
 
@@ -185,7 +185,7 @@ def test_dec_input_output(input_type, input_dtype, input_shape, output_type):
         assert array_identical(est.input_any_, X_out)
 
     # Now Test with output_type=output_type
-    est = TestEstimator(output_type=output_type)
+    est = DummyTestEstimator(output_type=output_type)
 
     est.store_input(X_in)
 
@@ -211,7 +211,7 @@ def test_auto_fit(input_type, input_dtype, input_shape):
     X_in = create_input(input_type, input_dtype, input_shape, "C")
 
     # Test with output_type="input"
-    est = TestEstimator()
+    est = DummyTestEstimator()
 
     est.fit(X_in)
 
@@ -243,7 +243,7 @@ def test_auto_predict(input_type, base_output_type, global_output_type):
     X_in = create_input(input_type, np.float32, (10, 10), "F")
 
     # Test with output_type="input"
-    est = TestEstimator()
+    est = DummyTestEstimator()
 
     # With cuml.global_settings.output_type == None, this should return the
     # input type
@@ -254,7 +254,7 @@ def test_auto_predict(input_type, base_output_type, global_output_type):
     assert array_identical(X_in, X_out)
 
     # Test with output_type=base_output_type
-    est = TestEstimator(output_type=base_output_type)
+    est = DummyTestEstimator(output_type=base_output_type)
 
     # With cuml.global_settings.output_type == None, this should return the
     # base_output_type
diff --git a/python/cuml/test/test_hdbscan.py b/python/cuml/test/test_hdbscan.py
index 6d898333d5..c1a7af0515 100644
--- a/python/cuml/test/test_hdbscan.py
+++ b/python/cuml/test/test_hdbscan.py
@@ -150,9 +150,9 @@ def test_hdbscan_blobs(nrows, ncols, nclusters,
                        max_cluster_size,
                        min_samples):
 
-    X, y = make_blobs(int(nrows),
-                      ncols,
-                      nclusters,
+    X, y = make_blobs(n_samples=int(nrows),
+                      n_features=ncols,
+                      centers=nclusters,
                       cluster_std=0.7,
                       shuffle=False,
                       random_state=42)
@@ -429,9 +429,9 @@ def test_hdbscan_empty_cluster_tree():
 
 def test_hdbscan_plots():
 
-    X, y = make_blobs(int(100),
-                      100,
-                      10,
+    X, y = make_blobs(n_samples=int(100),
+                      n_features=100,
+                      centers=10,
                       cluster_std=0.7,
                       shuffle=False,
                       random_state=42)
diff --git a/python/cuml/test/test_holtwinters.py b/python/cuml/test/test_holtwinters.py
index d9d40c0867..71fba7e8bb 100644
--- a/python/cuml/test/test_holtwinters.py
+++ b/python/cuml/test/test_holtwinters.py
@@ -75,7 +75,9 @@ def test_singlets_holtwinters(seasonal, h, datatype):
     if seasonal == "multiplicative":
         pytest.xfail("Statsmodels nan errors with gcc 9.3 (Issue #3384)")
 
-    sm_hw = sm_ES(train, seasonal=seasonal,
+    sm_hw = sm_ES(train,
+                  initialization_method='heuristic',
+                  seasonal=seasonal,
                   seasonal_periods=12)
     sm_hw = sm_hw.fit()
 
@@ -113,9 +115,11 @@ def test_multits_holtwinters(seasonal, h, datatype):
                     seasonal_periods=12, ts_num=2)
 
     sm_air_hw = sm_ES(air_train,
+                      initialization_method='heuristic',
                       seasonal=seasonal,
                       seasonal_periods=12)
     sm_co2_hw = sm_ES(co2_train,
+                      initialization_method='heuristic',
                       seasonal=seasonal,
                       seasonal_periods=12)
     cu_hw.fit()
diff --git a/python/cuml/test/test_kneighbors_classifier.py b/python/cuml/test/test_kneighbors_classifier.py
index 5caf7cb721..3dd5b08510 100644
--- a/python/cuml/test/test_kneighbors_classifier.py
+++ b/python/cuml/test/test_kneighbors_classifier.py
@@ -218,7 +218,7 @@ def test_predict_non_gaussian(n_samples, n_features, n_neighbors, n_query):
     X_device_test = cudf.DataFrame.from_pandas(X_host_test)
 
     knn_sk = skKNN(algorithm="brute", n_neighbors=n_neighbors, n_jobs=1)
-    knn_sk.fit(X_host_train, y_host_train)
+    knn_sk.fit(X_host_train, y_host_train.values.ravel())
 
     sk_result = knn_sk.predict(X_host_test)
 
diff --git a/python/cuml/test/test_linear_model.py b/python/cuml/test/test_linear_model.py
index 6c43de7554..f2cefb7bbf 100644
--- a/python/cuml/test/test_linear_model.py
+++ b/python/cuml/test/test_linear_model.py
@@ -133,7 +133,8 @@ def test_linear_regression_single_column():
     '''Test that linear regression can be run on single column with more than
     46340 rows (a limitation on CUDA <11)'''
     model = cuLinearRegression()
-    model.fit(cp.random.rand(46341), cp.random.rand(46341))
+    with pytest.warns(UserWarning):
+        model.fit(cp.random.rand(46341), cp.random.rand(46341))
 
 
 @pytest.mark.parametrize("datatype", [np.float32, np.float64])
@@ -239,6 +240,9 @@ def test_ridge_regression_model(datatype, algorithm, nrows, column_info):
 )
 @pytest.mark.parametrize("nrows", [unit_param(1000)])
 @pytest.mark.parametrize("column_info", [unit_param([20, 10])])
+# ignoring UserWarnings in sklearn about setting unused parameters
+# like l1 for none penalty
+@pytest.mark.filterwarnings("ignore::UserWarning:sklearn[.*]")
 def test_logistic_regression(
     num_classes, dtype, penalty, l1_ratio,
     fit_intercept, nrows, column_info, C, tol
@@ -257,9 +261,13 @@ def test_logistic_regression(
     )
     y_train = y_train.astype(dtype)
     y_test = y_test.astype(dtype)
+
     culog = cuLog(
-        penalty=penalty, l1_ratio=l1_ratio, C=C,
-        fit_intercept=fit_intercept, tol=tol
+        penalty=penalty,
+        l1_ratio=l1_ratio,
+        C=C,
+        fit_intercept=fit_intercept,
+        tol=tol
     )
     culog.fit(X_train, y_train)
 
@@ -633,6 +641,8 @@ def test_logistic_regression_weighting(regression_dataset,
 
 
 @pytest.mark.parametrize('algo', [cuLog, cuRidge])
+# ignoring warning about change of solver
+@pytest.mark.filterwarnings("ignore::UserWarning:cuml[.*]")
 def test_linear_models_set_params(algo):
     x = np.linspace(0, 1, 50)
     y = 2 * x
diff --git a/python/cuml/test/test_make_classification.py b/python/cuml/test/test_make_classification.py
index c704a41c1a..f715261c00 100644
--- a/python/cuml/test/test_make_classification.py
+++ b/python/cuml/test/test_make_classification.py
@@ -68,7 +68,7 @@ def test_make_classification_informative_features():
                                                          (2, [1/2] * 2, 2),
                                                          (2, [3/4, 1/4], 2),
                                                          (10, [1/3] * 3, 10),
-                                                         (np.int(64), [1], 1)
+                                                         (int(64), [1], 1)
                                                          ]:
         n_classes = len(weights)
         n_clusters = n_classes * n_clusters_per_class
diff --git a/python/cuml/test/test_mbsgd_classifier.py b/python/cuml/test/test_mbsgd_classifier.py
index 4d802df6bb..24a2d0876f 100644
--- a/python/cuml/test/test_mbsgd_classifier.py
+++ b/python/cuml/test/test_mbsgd_classifier.py
@@ -61,6 +61,7 @@ def make_dataset(request):
         ('constant', 'elasticnet', 'hinge'),
     ]
 )
+@pytest.mark.filterwarnings("ignore:Maximum::sklearn[.*]")
 def test_mbsgd_classifier_vs_skl(lrate, penalty, loss, make_dataset):
     nrows, X_train, X_test, y_train, y_test = make_dataset
 
diff --git a/python/cuml/test/test_mbsgd_regressor.py b/python/cuml/test/test_mbsgd_regressor.py
index d21f0e8f17..ee9aacaa55 100644
--- a/python/cuml/test/test_mbsgd_regressor.py
+++ b/python/cuml/test/test_mbsgd_regressor.py
@@ -64,6 +64,7 @@ def make_dataset(request):
         ('constant', 'elasticnet'),
     ]
 )
+@pytest.mark.filterwarnings("ignore:Maximum::sklearn[.*]")
 def test_mbsgd_regressor_vs_skl(lrate, penalty, make_dataset):
     nrows, datatype, X_train, X_test, y_train, y_test = make_dataset
 
@@ -85,7 +86,7 @@ def test_mbsgd_regressor_vs_skl(lrate, penalty, make_dataset):
                                          tol=0.0, penalty=penalty,
                                          random_state=0)
 
-        skl_sgd_regressor.fit(cp.asnumpy(X_train), cp.asnumpy(y_train))
+        skl_sgd_regressor.fit(cp.asnumpy(X_train), cp.asnumpy(y_train).ravel())
         skl_pred = skl_sgd_regressor.predict(cp.asnumpy(X_test))
         skl_r2 = r2_score(skl_pred, cp.asnumpy(y_test),
                           convert_dtype=datatype)
diff --git a/python/cuml/test/test_metrics.py b/python/cuml/test/test_metrics.py
index bb18ecddd4..04ca6e97ff 100644
--- a/python/cuml/test/test_metrics.py
+++ b/python/cuml/test/test_metrics.py
@@ -13,6 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+
+import pytest
+
 import random
 from itertools import chain, permutations
 from functools import partial
@@ -22,7 +25,6 @@
 import cupy as cp
 import cupyx
 import numpy as np
-import pytest
 import cudf
 
 from cuml.ensemble import RandomForestClassifier as curfc
@@ -186,9 +188,12 @@ def test_accuracy(nrows, ncols, n_info, datatype):
     # Initialize, fit and predict using cuML's
     # random forest classification model
     cuml_model = curfc(max_features=1.0,
-                       n_bins=8, split_algo=0, split_criterion=0,
+                       n_bins=8,
+                       split_criterion=0,
                        min_samples_leaf=2,
-                       n_estimators=40, handle=handle, max_leaves=-1,
+                       n_estimators=40,
+                       handle=handle,
+                       max_leaves=-1,
                        max_depth=16)
 
     cuml_model.fit(X_train, y_train)
@@ -464,7 +469,7 @@ def test_completeness_score_big_array(use_handle, input_range):
 
 
 def test_regression_metrics():
-    y_true = np.arange(50, dtype=np.int)
+    y_true = np.arange(50, dtype=int)
     y_pred = y_true + 1
     assert_almost_equal(mean_squared_error(y_true, y_pred), 1.)
     assert_almost_equal(mean_squared_log_error(y_true, y_pred),
@@ -497,8 +502,8 @@ def test_regression_metrics_random(n_samples, dtype, function):
 
 @pytest.mark.parametrize('function', ['mse', 'mse_not_squared', 'mae', 'msle'])
 def test_regression_metrics_at_limits(function):
-    y_true = np.array([0.], dtype=np.float)
-    y_pred = np.array([0.], dtype=np.float)
+    y_true = np.array([0.], dtype=float)
+    y_pred = np.array([0.], dtype=float)
 
     cuml_reg = {
         'mse': mean_squared_error,
@@ -538,8 +543,8 @@ def test_multioutput_regression():
 
 
 def test_regression_metrics_multioutput_array():
-    y_true = np.array([[1, 2], [2.5, -1], [4.5, 3], [5, 7]], dtype=np.float)
-    y_pred = np.array([[1, 1], [2, -1], [5, 4], [5, 6.5]], dtype=np.float)
+    y_true = np.array([[1, 2], [2.5, -1], [4.5, 3], [5, 7]], dtype=float)
+    y_pred = np.array([[1, 1], [2, -1], [5, 4], [5, 6.5]], dtype=float)
 
     mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
     mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values')
@@ -547,15 +552,15 @@ def test_regression_metrics_multioutput_array():
     cp.testing.assert_array_almost_equal(mse, [0.125, 0.5625], decimal=2)
     cp.testing.assert_array_almost_equal(mae, [0.25, 0.625], decimal=2)
 
-    weights = np.array([0.4, 0.6], dtype=np.float)
+    weights = np.array([0.4, 0.6], dtype=float)
     msew = mean_squared_error(y_true, y_pred, multioutput=weights)
     rmsew = mean_squared_error(y_true, y_pred, multioutput=weights,
                                squared=False)
     assert_almost_equal(msew, 0.39, decimal=2)
     assert_almost_equal(rmsew, 0.62, decimal=2)
 
-    y_true = np.array([[0, 0]] * 4, dtype=np.int)
-    y_pred = np.array([[1, 1]] * 4, dtype=np.int)
+    y_true = np.array([[0, 0]] * 4, dtype=int)
+    y_pred = np.array([[1, 1]] * 4, dtype=int)
     mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
     mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values')
     cp.testing.assert_array_almost_equal(mse, [1., 1.], decimal=2)
@@ -571,9 +576,9 @@ def test_regression_metrics_multioutput_array():
 
 @pytest.mark.parametrize('function', ['mse', 'mae'])
 def test_regression_metrics_custom_weights(function):
-    y_true = np.array([1, 2, 2.5, -1], dtype=np.float)
-    y_pred = np.array([1, 1, 2, -1], dtype=np.float)
-    weights = np.array([0.2, 0.25, 0.4, 0.15], dtype=np.float)
+    y_true = np.array([1, 2, 2.5, -1], dtype=float)
+    y_pred = np.array([1, 1, 2, -1], dtype=float)
+    weights = np.array([0.2, 0.25, 0.4, 0.15], dtype=float)
 
     cuml_reg, sklearn_reg = {
         'mse': (mean_squared_error, sklearn_mse),
@@ -586,9 +591,9 @@ def test_regression_metrics_custom_weights(function):
 
 
 def test_mse_vs_msle_custom_weights():
-    y_true = np.array([0.5, 2, 7, 6], dtype=np.float)
-    y_pred = np.array([0.5, 1, 8, 8], dtype=np.float)
-    weights = np.array([0.2, 0.25, 0.4, 0.15], dtype=np.float)
+    y_true = np.array([0.5, 2, 7, 6], dtype=float)
+    y_pred = np.array([0.5, 1, 8, 8], dtype=float)
+    weights = np.array([0.2, 0.25, 0.4, 0.15], dtype=float)
     msle = mean_squared_log_error(y_true, y_pred, sample_weight=weights)
     msle2 = mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred),
                                sample_weight=weights)
@@ -741,8 +746,8 @@ def test_roc_auc_score_random(n_samples, dtype):
 
 
 def test_roc_auc_score_at_limits():
-    y_true = np.array([0., 0., 0.], dtype=np.float)
-    y_pred = np.array([0., 0.5, 1.], dtype=np.float)
+    y_true = np.array([0., 0., 0.], dtype=float)
+    y_pred = np.array([0., 0.5, 1.], dtype=float)
 
     err_msg = ("roc_auc_score cannot be used when "
                "only one class present in y_true. ROC AUC score "
@@ -751,8 +756,8 @@ def test_roc_auc_score_at_limits():
     with pytest.raises(ValueError, match=err_msg):
         roc_auc_score(y_true, y_pred)
 
-    y_true = np.array([0., 0.5, 1.0], dtype=np.float)
-    y_pred = np.array([0., 0.5, 1.], dtype=np.float)
+    y_true = np.array([0., 0.5, 1.0], dtype=float)
+    y_pred = np.array([0., 0.5, 1.], dtype=float)
 
     err_msg = ("Continuous format of y_true  "
                "is not supported.")
@@ -777,8 +782,8 @@ def test_precision_recall_curve():
 
 
 def test_precision_recall_curve_at_limits():
-    y_true = np.array([0., 0., 0.], dtype=np.float)
-    y_pred = np.array([0., 0.5, 1.], dtype=np.float)
+    y_true = np.array([0., 0., 0.], dtype=float)
+    y_pred = np.array([0., 0.5, 1.], dtype=float)
 
     err_msg = ("precision_recall_curve cannot be used when "
                "y_true is all zero.")
@@ -786,8 +791,8 @@ def test_precision_recall_curve_at_limits():
     with pytest.raises(ValueError, match=err_msg):
         precision_recall_curve(y_true, y_pred)
 
-    y_true = np.array([0., 0.5, 1.0], dtype=np.float)
-    y_pred = np.array([0., 0.5, 1.], dtype=np.float)
+    y_true = np.array([0., 0.5, 1.0], dtype=float)
+    y_pred = np.array([0., 0.5, 1.], dtype=float)
 
     err_msg = ("Continuous format of y_true  "
                "is not supported.")
@@ -845,8 +850,8 @@ def test_log_loss_random(n_samples, dtype):
 
 
 def test_log_loss_at_limits():
-    y_true = np.array([0., 1., 2.], dtype=np.float)
-    y_pred = np.array([0., 0.5, 1.], dtype=np.float)
+    y_true = np.array([0., 1., 2.], dtype=float)
+    y_pred = np.array([0., 0.5, 1.], dtype=float)
 
     err_msg = ("The shape of y_pred doesn't "
                "match the number of classes")
@@ -854,8 +859,8 @@ def test_log_loss_at_limits():
     with pytest.raises(ValueError, match=err_msg):
         log_loss(y_true, y_pred)
 
-    y_true = np.array([0., 0.5, 1.0], dtype=np.float)
-    y_pred = np.array([0., 0.5, 1.], dtype=np.float)
+    y_true = np.array([0., 0.5, 1.0], dtype=float)
+    y_pred = np.array([0., 0.5, 1.], dtype=float)
 
     err_msg = ("'y_true' can only have integer values")
     with pytest.raises(ValueError, match=err_msg):
@@ -1180,6 +1185,8 @@ def ref_sparse_pairwise_dist(X, Y=None, metric=None):
 @pytest.mark.parametrize("matrix_size, density", [
     ((3, 3), 0.7),
     ((5, 40), 0.2)])
+# ignoring boolean conversion warning for both cuml and sklearn
+@pytest.mark.filterwarnings("ignore:(.*)converted(.*)::")
 def test_sparse_pairwise_distances_corner_cases(metric: str, matrix_size,
                                                 density: float):
     # Test the sparse_pairwise_distance helper function.
@@ -1244,7 +1251,7 @@ def test_sparse_pairwise_distances_exceptions():
     X_int = sparse.random(5, 4, dtype=np.float32,
                           random_state=123, density=0.3) * 10
     X_int.dtype = cp.int32
-    X_bool = sparse.random(5, 4, dtype=cp.bool,
+    X_bool = sparse.random(5, 4, dtype=bool,
                            random_state=123, density=0.3)
     X_double = cupyx.scipy.sparse.random(5, 4, dtype=cp.float64,
                                          random_state=123, density=0.3)
@@ -1286,6 +1293,8 @@ def test_sparse_pairwise_distances_exceptions():
     unit_param((20, 10000), 0.01),
     quality_param((2000, 1000), 0.05),
     stress_param((10000, 10000), 0.01)])
+# ignoring boolean conversion warning for both cuml and sklearn
+@pytest.mark.filterwarnings("ignore:(.*)converted(.*)::")
 def test_sparse_pairwise_distances_sklearn_comparison(metric: str, matrix_size,
                                                       density: float):
     # Test larger sizes to sklearn
diff --git a/python/cuml/test/test_nearest_neighbors.py b/python/cuml/test/test_nearest_neighbors.py
index 24a9f35a35..3a33e1c94b 100644
--- a/python/cuml/test/test_nearest_neighbors.py
+++ b/python/cuml/test/test_nearest_neighbors.py
@@ -15,6 +15,8 @@
 #
 
 import pytest
+pytestmark = pytest.mark.filterwarnings("ignore:((.|\n)*)#4020((.|\n)*):UserWarning:cuml[.*]")
+
 import math
 
 from cuml.test.utils import array_equal, unit_param, quality_param, \
@@ -475,26 +477,26 @@ def test_knn_graph(input_type, mode, output_type, as_instance,
                       n_features=n_feats, random_state=0)
 
     if as_instance:
-        sparse_sk = sklearn.neighbors.kneighbors_graph(X.get(), k, mode,
+        sparse_sk = sklearn.neighbors.kneighbors_graph(X.get(), k, mode=mode,
                                                        metric=metric, p=p,
                                                        include_self='auto')
     else:
         knn_sk = skKNN(metric=metric, p=p)
         knn_sk.fit(X.get())
-        sparse_sk = knn_sk.kneighbors_graph(X.get(), k, mode)
+        sparse_sk = knn_sk.kneighbors_graph(X.get(), k, mode=mode)
 
     if input_type == "dataframe":
         X = cudf.DataFrame(X)
 
     with cuml.using_output_type(output_type):
         if as_instance:
-            sparse_cu = cuml.neighbors.kneighbors_graph(X, k, mode,
+            sparse_cu = cuml.neighbors.kneighbors_graph(X, k, mode=mode,
                                                         metric=metric, p=p,
                                                         include_self='auto')
         else:
             knn_cu = cuKNN(metric=metric, p=p)
             knn_cu.fit(X)
-            sparse_cu = knn_cu.kneighbors_graph(X, k, mode)
+            sparse_cu = knn_cu.kneighbors_graph(X, k, mode=mode)
 
     assert np.array_equal(sparse_sk.data.shape, sparse_cu.data.shape)
     assert np.array_equal(sparse_sk.indices.shape, sparse_cu.indices.shape)
@@ -514,6 +516,7 @@ def test_knn_graph(input_type, mode, output_type, as_instance,
      (10, 35, 0.8, 4, 10, 20000),
      (40, 35, 0.5, 4, 20000, 10),
      (35, 35, 0.8, 4, 20000, 20000)])
+@pytest.mark.filterwarnings("ignore:(.*)converted(.*)::")
 def test_nearest_neighbors_sparse(metric,
                                   nrows,
                                   ncols,
diff --git a/python/cuml/test/test_one_hot_encoder.py b/python/cuml/test/test_one_hot_encoder.py
index 344dc1a7db..6a5045504b 100644
--- a/python/cuml/test/test_one_hot_encoder.py
+++ b/python/cuml/test/test_one_hot_encoder.py
@@ -129,6 +129,7 @@ def test_onehot_categories(as_array):
 
 
 @pytest.mark.parametrize('as_array', [True, False], ids=['cupy', 'cudf'])
+@pytest.mark.filterwarnings("ignore:((.|\n)*)unknown((.|\n)*):UserWarning:cuml[.*]")
 def test_onehot_fit_handle_unknown(as_array):
     X = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
     Y = DataFrame({'chars': ['c', 'b'], 'int': [0, 2]})
@@ -166,6 +167,7 @@ def test_onehot_transform_handle_unknown(as_array):
 
 
 @pytest.mark.parametrize('as_array', [True, False], ids=['cupy', 'cudf'])
+@pytest.mark.filterwarnings("ignore:((.|\n)*)unknown((.|\n)*):UserWarning:cuml[.*]")
 def test_onehot_inverse_transform_handle_unknown(as_array):
     X = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
     Y_ohe = cp.array([[0., 0., 1., 0.],
diff --git a/python/cuml/test/test_pickle.py b/python/cuml/test/test_pickle.py
index d961642a60..b022f6eb75 100644
--- a/python/cuml/test/test_pickle.py
+++ b/python/cuml/test/test_pickle.py
@@ -324,7 +324,7 @@ def create_mod():
 
         result["umap"] = trustworthiness(X_train,
                                          cu_before_pickle_transform,
-                                         n_neighbors)
+                                         n_neighbors=n_neighbors)
         return model, X_train
 
     def assert_model(pickled_model, X_train):
@@ -335,7 +335,7 @@ def assert_model(pickled_model, X_train):
 
         cu_trust_after = trustworthiness(X_train,
                                          pickled_model.transform(X_train),
-                                         n_neighbors)
+                                         n_neighbors=n_neighbors)
         assert cu_trust_after >= result["umap"] - 0.2
 
     pickle_save_load(tmpdir, create_mod, assert_model)
@@ -366,6 +366,7 @@ def assert_model(pickled_model, X_test):
 
 @pytest.mark.parametrize('model_name',
                          all_models.keys())
+@pytest.mark.filterwarnings("ignore:Transformers((.|\n)*):UserWarning:cuml[.*]")
 def test_unfit_pickle(model_name):
     # Any model xfailed in this test cannot be used for hyperparameter sweeps
     # with dask or sklearn
@@ -382,6 +383,7 @@ def test_unfit_pickle(model_name):
 
 @pytest.mark.parametrize('model_name',
                          all_models.keys())
+@pytest.mark.filterwarnings("ignore:((.|\n)*)unknown((.|\n)*):UserWarning:cuml[.*]")
 def test_unfit_clone(model_name):
     if model_name in unfit_clone_xfail:
         pytest.xfail()
diff --git a/python/pytest.ini b/python/pytest.ini
index 0d918c24d2..709406abff 100644
--- a/python/pytest.ini
+++ b/python/pytest.ini
@@ -12,3 +12,4 @@ testpaths = cuml/test
 filterwarnings =
   error::FutureWarning:cuml[.*] # Catch uses of deprecated positional args in testing
   ignore:[^.]*ABCs[^.]*:DeprecationWarning:patsy[.*]
+  ignore:(.*)alias(.*):DeprecationWarning:hdbscan[.*]

From 68b27e8740b9fde85848be5d035fd826627b0dd5 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Fri, 10 Sep 2021 11:58:59 -0500
Subject: [PATCH 08/18] FIX Multiple pytest warning fixes

---
 python/cuml/test/test_pickle.py               |  7 +-
 python/cuml/test/test_preprocessing.py        | 69 ++++++++++---------
 .../thirdparty_adapters/sparsefuncs_fast.py   |  3 +-
 3 files changed, 41 insertions(+), 38 deletions(-)

diff --git a/python/cuml/test/test_pickle.py b/python/cuml/test/test_pickle.py
index b022f6eb75..0f3da56346 100644
--- a/python/cuml/test/test_pickle.py
+++ b/python/cuml/test/test_pickle.py
@@ -383,7 +383,7 @@ def test_unfit_pickle(model_name):
 
 @pytest.mark.parametrize('model_name',
                          all_models.keys())
-@pytest.mark.filterwarnings("ignore:((.|\n)*)unknown((.|\n)*):UserWarning:cuml[.*]")
+@pytest.mark.filterwarnings("ignore:Transformers((.|\n)*):UserWarning:cuml[.*]")
 def test_unfit_clone(model_name):
     if model_name in unfit_clone_xfail:
         pytest.xfail()
@@ -578,7 +578,7 @@ def assert_model(pickled_model, X):
         result["fit_model"] = pickled_model.fit(X)
         result["data"] = X
         result["trust"] = trustworthiness(
-            X, pickled_model.embedding_, 10)
+            X, pickled_model.embedding_, n_neighbors=10)
 
     def create_mod_2():
         model = result["fit_model"]
@@ -586,7 +586,7 @@ def create_mod_2():
 
     def assert_second_model(pickled_model, X):
         trust_after = trustworthiness(
-            X, pickled_model.embedding_, 10)
+            X, pickled_model.embedding_, n_neighbors=10)
         assert result["trust"] == trust_after
 
     pickle_save_load(tmpdir, create_mod, assert_model)
@@ -708,6 +708,7 @@ def assert_model(pickled_model, X):
 @pytest.mark.parametrize('nrows', [unit_param(100)])
 @pytest.mark.parametrize('ncols', [unit_param(20)])
 @pytest.mark.parametrize('n_info', [unit_param(10)])
+@pytest.mark.filterwarnings("ignore:((.|\n)*)n_streams((.|\n)*):UserWarning:cuml[.*]")
 def test_small_rf(tmpdir, key, datatype, nrows, ncols, n_info):
 
     result = {}
diff --git a/python/cuml/test/test_preprocessing.py b/python/cuml/test/test_preprocessing.py
index 59021e8916..6529a3c16c 100644
--- a/python/cuml/test/test_preprocessing.py
+++ b/python/cuml/test/test_preprocessing.py
@@ -65,6 +65,7 @@
 
 import numpy as np
 import cupy as cp
+import cupyx as cpx
 import scipy
 
 
@@ -139,12 +140,12 @@ def test_standard_scaler_sparse(failure_logger,
     r_X = scaler.inverse_transform(t_X)
     #  assert type(t_X) == type(X)
     #  assert type(r_X) == type(t_X)
-    if cupyx.scipy.sparse.issparse(X):
-        assert cupyx.scipy.sparse.issparse(t_X)
+    if cpx.scipy.sparse.issparse(X):
+        assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
-    if cupyx.scipy.sparse.issparse(t_X):
-        assert cupyx.scipy.sparse.issparse(r_X)
+    if cpx.scipy.sparse.issparse(t_X):
+        assert cpx.scipy.sparse.issparse(r_X)
     if scipy.sparse.issparse(t_X):
         assert scipy.sparse.issparse(r_X)
 
@@ -180,8 +181,8 @@ def test_scale_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
 
     t_X = cu_scale(X, with_mean=False, with_std=with_std, copy=True)
     #  assert type(t_X) == type(X)
-    if cupyx.scipy.sparse.issparse(X):
-        assert cupyx.scipy.sparse.issparse(t_X)
+    if cpx.scipy.sparse.issparse(X):
+        assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
 
@@ -230,12 +231,12 @@ def test_maxabs_scaler_sparse(failure_logger,
     r_X = scaler.inverse_transform(t_X)
     #  assert type(t_X) == type(X)
     #  assert type(r_X) == type(t_X)
-    if cupyx.scipy.sparse.issparse(X):
-        assert cupyx.scipy.sparse.issparse(t_X)
+    if cpx.scipy.sparse.issparse(X):
+        assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
-    if cupyx.scipy.sparse.issparse(t_X):
-        assert cupyx.scipy.sparse.issparse(r_X)
+    if cpx.scipy.sparse.issparse(t_X):
+        assert cpx.scipy.sparse.issparse(r_X)
     if scipy.sparse.issparse(t_X):
         assert scipy.sparse.issparse(r_X)
 
@@ -272,8 +273,8 @@ def test_normalizer_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
     normalizer = cuNormalizer(norm=norm, copy=True)
     t_X = normalizer.fit_transform(X)
     #  assert type(t_X) == type(X)
-    if cupyx.scipy.sparse.issparse(X):
-        assert cupyx.scipy.sparse.issparse(t_X)
+    if cpx.scipy.sparse.issparse(X):
+        assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
 
@@ -314,8 +315,8 @@ def test_normalize_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
 
     t_X = cu_normalize(X, axis=axis, norm=norm)
     #  assert type(t_X) == type(X)
-    if cupyx.scipy.sparse.issparse(X):
-        assert cupyx.scipy.sparse.issparse(t_X)
+    if cpx.scipy.sparse.issparse(X):
+        assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
 
@@ -369,8 +370,8 @@ def test_imputer_sparse(sparse_imputer_dataset,  # noqa: F811
                               strategy=strategy, fill_value=fill_value)
     t_X = imputer.fit_transform(X)
     #  assert type(t_X) == type(X)
-    if cupyx.scipy.sparse.issparse(X):
-        assert cupyx.scipy.sparse.issparse(t_X)
+    if cpx.scipy.sparse.issparse(X):
+        assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
 
@@ -423,8 +424,8 @@ def test_poly_features_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
                                         include_bias=include_bias)
     t_X = polyfeatures.fit_transform(X)
     #  assert type(t_X) == type(X)
-    if cupyx.scipy.sparse.issparse(X):
-        assert cupyx.scipy.sparse.issparse(t_X)
+    if cpx.scipy.sparse.issparse(X):
+        assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
 
@@ -455,8 +456,8 @@ def test_add_dummy_feature_sparse(failure_logger,
 
     t_X = cu_add_dummy_feature(X, value=value)
     #  assert type(t_X) == type(X)
-    if cupyx.scipy.sparse.issparse(X):
-        assert cupyx.scipy.sparse.issparse(t_X)
+    if cpx.scipy.sparse.issparse(X):
+        assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
 
@@ -483,8 +484,8 @@ def test_binarize_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
 
     t_X = cu_binarize(X, threshold=threshold, copy=True)
     #  assert type(t_X) == type(X)
-    if cupyx.scipy.sparse.issparse(X):
-        assert cupyx.scipy.sparse.issparse(t_X)
+    if cpx.scipy.sparse.issparse(X):
+        assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
 
@@ -515,8 +516,8 @@ def test_binarizer_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
     binarizer = cuBinarizer(threshold=threshold, copy=True)
     t_X = binarizer.fit_transform(X)
     #  assert type(t_X) == type(X)
-    if cupyx.scipy.sparse.issparse(X):
-        assert cupyx.scipy.sparse.issparse(t_X)
+    if cpx.scipy.sparse.issparse(X):
+        assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
 
@@ -570,12 +571,12 @@ def test_robust_scaler_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
     r_X = scaler.inverse_transform(t_X)
     #  assert type(t_X) == type(X)
     #  assert type(r_X) == type(t_X)
-    if cupyx.scipy.sparse.issparse(X):
-        assert cupyx.scipy.sparse.issparse(t_X)
+    if cpx.scipy.sparse.issparse(X):
+        assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
-    if cupyx.scipy.sparse.issparse(t_X):
-        assert cupyx.scipy.sparse.issparse(r_X)
+    if cpx.scipy.sparse.issparse(t_X):
+        assert cpx.scipy.sparse.issparse(r_X)
     if scipy.sparse.issparse(t_X):
         assert scipy.sparse.issparse(r_X)
 
@@ -632,8 +633,8 @@ def test_robust_scale_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
                           quantile_range=quantile_range,
                           copy=True)
     #  assert type(t_X) == type(X)
-    if cupyx.scipy.sparse.issparse(X):
-        assert cupyx.scipy.sparse.issparse(t_X)
+    if cpx.scipy.sparse.issparse(X):
+        assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
         assert scipy.sparse.issparse(t_X)
 
@@ -728,11 +729,11 @@ def test_missing_indicator_sparse(failure_logger,
                                    missing_values=1)
     ft_X = indicator.fit_transform(X)
     # assert type(ft_X) == type(X)
-    assert cupyx.scipy.sparse.issparse(ft_X) or scipy.sparse.issparse(ft_X)
+    assert cpx.scipy.sparse.issparse(ft_X) or scipy.sparse.issparse(ft_X)
     indicator.fit(X)
     t_X = indicator.transform(X)
     # assert type(t_X) == type(X)
-    assert cupyx.scipy.sparse.issparse(t_X) or scipy.sparse.issparse(t_X)
+    assert cpx.scipy.sparse.issparse(t_X) or scipy.sparse.issparse(t_X)
 
     indicator = skMissingIndicator(features=features,
                                    missing_values=1)
@@ -769,8 +770,8 @@ def test_function_transformer_sparse(sparse_clf_dataset):  # noqa: F811
                                         accept_sparse=True)
     t_X = transformer.fit_transform(X)
     r_X = transformer.inverse_transform(t_X)
-    assert cupyx.scipy.sparse.issparse(t_X) or scipy.sparse.issparse(t_X)
-    assert cupyx.scipy.sparse.issparse(r_X) or scipy.sparse.issparse(r_X)
+    assert cpx.scipy.sparse.issparse(t_X) or scipy.sparse.issparse(t_X)
+    assert cpx.scipy.sparse.issparse(r_X) or scipy.sparse.issparse(r_X)
 
     transformer = skFunctionTransformer(func=lambda x: x * 2,
                                         inverse_func=lambda x: x / 2,
diff --git a/python/cuml/thirdparty_adapters/sparsefuncs_fast.py b/python/cuml/thirdparty_adapters/sparsefuncs_fast.py
index 01cc091f25..3e10c47c6f 100644
--- a/python/cuml/thirdparty_adapters/sparsefuncs_fast.py
+++ b/python/cuml/thirdparty_adapters/sparsefuncs_fast.py
@@ -16,6 +16,7 @@
 
 
 import cupy as cp
+import cupyx as cpx
 from numba import cuda
 from math import ceil
 
@@ -356,6 +357,6 @@ def csr_polynomial_expansion(X, interaction_only, degree):
                                 d, interaction_only, degree,
                                 expanded_indptr)
 
-    return cupyx.scipy.sparse.csr_matrix((expanded_data, expanded_indices,
+    return cpx.scipy.sparse.csr_matrix((expanded_data, expanded_indices,
                                  expanded_indptr),
                                 shape=(num_rows, expanded_dimensionality))

From 754197a15da43cffc199bdc7b4797a5bf975f762 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Fri, 10 Sep 2021 12:36:29 -0500
Subject: [PATCH 09/18] FIX Multiple pytest warning fixes

---
 python/cuml/_thirdparty/sklearn/preprocessing/_data.py         | 2 +-
 .../cuml/_thirdparty/sklearn/preprocessing/_discretization.py  | 2 +-
 python/cuml/_thirdparty/sklearn/preprocessing/_imputation.py   | 2 +-
 python/cuml/test/test_preprocessing.py                         | 3 +++
 4 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/_data.py b/python/cuml/_thirdparty/sklearn/preprocessing/_data.py
index ca42e28a69..a6afb54e2a 100644
--- a/python/cuml/_thirdparty/sklearn/preprocessing/_data.py
+++ b/python/cuml/_thirdparty/sklearn/preprocessing/_data.py
@@ -1238,7 +1238,7 @@ def fit(self, X, y=None) -> "RobustScaler":
                 else:
                     column_data = X[:, feature_idx]
 
-                is_not_nan = ~np.isnan(column_data).astype(np.bool)
+                is_not_nan = ~np.isnan(column_data).astype(bool)
                 column_data = column_data[is_not_nan]
                 quantiles.append(np.percentile(column_data,
                                                self.quantile_range))
diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py b/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py
index 0e35e76619..87ad8a7f62 100644
--- a/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py
+++ b/python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py
@@ -258,7 +258,7 @@ def _validate_n_bins(self, n_features):
                 raise ValueError("{} received an invalid number "
                                  "of bins. Received {}, expected at least 2."
                                  .format(KBinsDiscretizer.__name__, orig_bins))
-            return np.full(n_features, orig_bins, dtype=np.int)
+            return np.full(n_features, orig_bins, dtype=int)
 
         n_bins = check_array(orig_bins, dtype=np.int, copy=True,
                              ensure_2d=False)
diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/_imputation.py b/python/cuml/_thirdparty/sklearn/preprocessing/_imputation.py
index b8ad6d0f7a..7087cb146e 100644
--- a/python/cuml/_thirdparty/sklearn/preprocessing/_imputation.py
+++ b/python/cuml/_thirdparty/sklearn/preprocessing/_imputation.py
@@ -447,7 +447,7 @@ def transform(self, X) -> SparseCumlArray:
             else:
                 mask = _get_mask(X.data, self.missing_values)
                 indexes = np.repeat(
-                    np.arange(len(X.indptr) - 1, dtype=np.int),
+                    np.arange(len(X.indptr) - 1, dtype=int),
                     np.diff(X.indptr).tolist())[mask]
 
                 X.data[mask] = valid_statistics[indexes].astype(X.dtype,
diff --git a/python/cuml/test/test_preprocessing.py b/python/cuml/test/test_preprocessing.py
index 6529a3c16c..2258bf48d2 100644
--- a/python/cuml/test/test_preprocessing.py
+++ b/python/cuml/test/test_preprocessing.py
@@ -160,6 +160,9 @@ def test_standard_scaler_sparse(failure_logger,
 @pytest.mark.parametrize("axis", [0, 1])
 @pytest.mark.parametrize("with_mean", [True, False])
 @pytest.mark.parametrize("with_std", [True, False])
+# FIXME: ignore warnings from cuml and sklearn about scaling issues
+# issue: https://github.com/rapidsai/cuml/issues/4203
+@pytest.mark.filterwarnings("ignore:Numerical issues::")
 def test_scale(failure_logger, clf_dataset, axis,  # noqa: F811
                with_mean, with_std):
     X_np, X = clf_dataset

From f94f622cc7a9d422413110d8e7e21e42ec935e40 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Fri, 10 Sep 2021 12:47:47 -0500
Subject: [PATCH 10/18] FIX Multiple pytest warning fixes

---
 python/cuml/model_selection/_split.py  | 2 +-
 python/cuml/test/test_preprocessing.py | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/cuml/model_selection/_split.py b/python/cuml/model_selection/_split.py
index d1fec1f569..e95f0d2072 100644
--- a/python/cuml/model_selection/_split.py
+++ b/python/cuml/model_selection/_split.py
@@ -222,7 +222,7 @@ def _approximate_mode(class_counts, n_draws, rng):
             need_to_add -= add_now
             if need_to_add == 0:
                 break
-    return floored.astype(cp.int)
+    return floored.astype(int)
 
 
 def train_test_split(X,
diff --git a/python/cuml/test/test_preprocessing.py b/python/cuml/test/test_preprocessing.py
index 2258bf48d2..e7da960eaa 100644
--- a/python/cuml/test/test_preprocessing.py
+++ b/python/cuml/test/test_preprocessing.py
@@ -751,13 +751,17 @@ def test_missing_indicator_sparse(failure_logger,
 def test_function_transformer(clf_dataset):  # noqa: F811
     X_np, X = clf_dataset
 
-    transformer = cuFunctionTransformer(func=cp.exp, inverse_func=cp.log)
+    transformer = cuFunctionTransformer(func=cp.exp,
+                                        inverse_func=cp.log,
+                                        check_inverse=False)
     t_X = transformer.fit_transform(X)
     r_X = transformer.inverse_transform(t_X)
     assert type(t_X) == type(X)
     assert type(r_X) == type(t_X)
 
-    transformer = skFunctionTransformer(func=np.exp, inverse_func=np.log)
+    transformer = skFunctionTransformer(func=np.exp,
+                                        inverse_func=np.log,
+                                        check_inverse=False)
     sk_t_X = transformer.fit_transform(X_np)
     sk_r_X = transformer.inverse_transform(sk_t_X)
 

From fa3f522f72d3febcf5e2a9ef028e2e3de184394d Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Fri, 10 Sep 2021 12:59:58 -0500
Subject: [PATCH 11/18] FIX Multiple pytest warning fixes

---
 python/cuml/preprocessing/text/stem/porter_stemmer.py | 2 +-
 python/cuml/test/test_random_forest.py                | 6 +++++-
 python/cuml/test/test_stationarity.py                 | 2 +-
 python/cuml/tsa/seasonality.pyx                       | 2 +-
 python/cuml/tsa/stationarity.pyx                      | 2 +-
 5 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/python/cuml/preprocessing/text/stem/porter_stemmer.py b/python/cuml/preprocessing/text/stem/porter_stemmer.py
index 2b507bb8e3..49dcd3e252 100644
--- a/python/cuml/preprocessing/text/stem/porter_stemmer.py
+++ b/python/cuml/preprocessing/text/stem/porter_stemmer.py
@@ -734,7 +734,7 @@ def get_condition_flag(word_str_ser, condition):
         return a bool series where flag is valid
     """
     if condition is None:
-        return cudf.Series(cp.ones(len(word_str_ser), np.bool))
+        return cudf.Series(cp.ones(len(word_str_ser), bool))
     else:
         return condition(word_str_ser)
 
diff --git a/python/cuml/test/test_random_forest.py b/python/cuml/test/test_random_forest.py
index 46e7d572cd..e430fc1dd1 100644
--- a/python/cuml/test/test_random_forest.py
+++ b/python/cuml/test/test_random_forest.py
@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import pytest
+pytestmark = pytest.mark.filterwarnings("ignore: For reproducible results(.*)::cuml[.*]")
+
 import cudf
 import numpy as np
-import pytest
 import random
 import json
 import io
@@ -365,6 +367,7 @@ def test_rf_classification_seed(small_clf, datatype):
     "datatype", [(np.float64, np.float32), (np.float32, np.float64)]
 )
 @pytest.mark.parametrize("convert_dtype", [True, False])
+@pytest.mark.filterwarnings("ignore:To use pickling(.*)::cuml[.*]")
 def test_rf_classification_float64(small_clf, datatype, convert_dtype):
 
     X, y = small_clf
@@ -410,6 +413,7 @@ def test_rf_classification_float64(small_clf, datatype, convert_dtype):
 @pytest.mark.parametrize(
     "datatype", [(np.float64, np.float32), (np.float32, np.float64)]
 )
+@pytest.mark.filterwarnings("ignore:To use pickling(.*)::cuml[.*]")
 def test_rf_regression_float64(large_reg, datatype):
 
     X, y = large_reg
diff --git a/python/cuml/test/test_stationarity.py b/python/cuml/test/test_stationarity.py
index 34e9bc8144..78dffc984f 100644
--- a/python/cuml/test/test_stationarity.py
+++ b/python/cuml/test/test_stationarity.py
@@ -45,7 +45,7 @@ def kpss_ref(y):
     """Wrapper around statsmodels' KPSS test
     """
     batch_size = y.shape[1]
-    test_results = np.zeros(batch_size, dtype=np.bool)
+    test_results = np.zeros(batch_size, dtype=bool)
     for i in range(batch_size):
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore")
diff --git a/python/cuml/tsa/seasonality.pyx b/python/cuml/tsa/seasonality.pyx
index 318375ffcc..bdd7d3008f 100644
--- a/python/cuml/tsa/seasonality.pyx
+++ b/python/cuml/tsa/seasonality.pyx
@@ -84,5 +84,5 @@ def seas_test(y, s, handle=None) -> CumlArray:
 
     # Temporary: Python implementation
     python_res = python_seas_test(h_y, batch_size, n_obs, s)
-    d_res, *_ = input_to_cuml_array(np.array(python_res), check_dtype=np.bool)
+    d_res, *_ = input_to_cuml_array(np.array(python_res), check_dtype=bool)
     return d_res
diff --git a/python/cuml/tsa/stationarity.pyx b/python/cuml/tsa/stationarity.pyx
index 42397fe70e..85ba5a9a4e 100644
--- a/python/cuml/tsa/stationarity.pyx
+++ b/python/cuml/tsa/stationarity.pyx
@@ -89,7 +89,7 @@ def kpss_test(y, d=0, D=0, s=0, pval_threshold=0.05,
         handle = Handle()
     cdef handle_t* handle_ = <handle_t*><size_t>handle.getHandle()
 
-    results = CumlArray.empty(batch_size, dtype=np.bool)
+    results = CumlArray.empty(batch_size, dtype=bool)
     cdef uintptr_t d_results = results.ptr
 
     # Call C++ function

From 1919ba8e3520a9fe398ec3cded1addcda954e951 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Fri, 10 Sep 2021 14:01:52 -0500
Subject: [PATCH 12/18] FIX Multiple pytest warning fixes

---
 python/cuml/manifold/t_sne.pyx                |  9 ++++---
 .../preprocessing/text/stem/porter_stemmer.py |  2 +-
 python/cuml/test/test_tfidf.py                |  1 +
 python/cuml/test/test_thirdparty.py           | 15 ++++++++---
 python/cuml/test/test_tsne.py                 |  4 ++-
 python/cuml/test/test_umap.py                 | 26 +++++++++++--------
 python/cuml/tsa/seasonality.pyx               |  3 ---
 python/cuml/tsa/stationarity.pyx              | 10 +++----
 8 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx
index 03aa7d8c6b..7a88498038 100644
--- a/python/cuml/manifold/t_sne.pyx
+++ b/python/cuml/manifold/t_sne.pyx
@@ -272,10 +272,11 @@ class TSNE(Base,
         if n_components < 0:
             raise ValueError("n_components = {} should be more "
                              "than 0.".format(n_components))
-        if n_components != 2 and (method == 'barnes_hut' or method == 'fft'):
-            warnings.warn("Barnes Hut and FFT only work when "
-                          "n_components == 2. Switching to exact.")
-            method = 'exact'
+        # Enable warning once n_components >= 2 is supported.
+        # if n_components != 2 and (method == 'barnes_hut' or method == 'fft'):
+        #     warnings.warn("Barnes Hut and FFT only work when "
+        #                   "n_components == 2. Switching to exact.")
+        #     method = 'exact'
         if n_components != 2:
             raise ValueError("Currently TSNE supports n_components = 2; "
                              "but got n_components = {}".format(n_components))
diff --git a/python/cuml/preprocessing/text/stem/porter_stemmer.py b/python/cuml/preprocessing/text/stem/porter_stemmer.py
index 49dcd3e252..a0ae1b77ca 100644
--- a/python/cuml/preprocessing/text/stem/porter_stemmer.py
+++ b/python/cuml/preprocessing/text/stem/porter_stemmer.py
@@ -812,5 +812,5 @@ def build_can_replace_mask(len_mask, mask):
       if mask is None else returns mask
     """
     if mask is None:
-        mask = cudf.Series(cp.ones(len_mask, dtype=cp.bool))
+        mask = cudf.Series(cp.ones(len_mask, dtype=bool))
     return mask
diff --git a/python/cuml/test/test_tfidf.py b/python/cuml/test/test_tfidf.py
index fd0364767d..194a84f81c 100644
--- a/python/cuml/test/test_tfidf.py
+++ b/python/cuml/test/test_tfidf.py
@@ -54,6 +54,7 @@
 @pytest.mark.parametrize('use_idf', [True, False])
 @pytest.mark.parametrize('smooth_idf', [True, False])
 @pytest.mark.parametrize('sublinear_tf', [True, False])
+@pytest.mark.filterwarnings("ignore:divide by zero(.*):RuntimeWarning:sklearn[.*]")
 def test_tfidf_transformer(data, norm, use_idf, smooth_idf, sublinear_tf):
     data_gpu = cp.array(data)
 
diff --git a/python/cuml/test/test_thirdparty.py b/python/cuml/test/test_thirdparty.py
index 24eeafa958..a6d4547858 100644
--- a/python/cuml/test/test_thirdparty.py
+++ b/python/cuml/test/test_thirdparty.py
@@ -17,6 +17,7 @@
 
 import numpy as np
 import cupy as cp
+import cupyx as cpx
 from cuml._thirdparty.sklearn.utils.validation import check_X_y
 
 from cuml._thirdparty.sklearn.utils.extmath import row_norms as cu_row_norms, \
@@ -55,9 +56,9 @@ def sparse_random_dataset(request, random_seed):
     random_loc = cp.random.choice(X.size, int(X.size * 0.3), replace=False)
     X.ravel()[random_loc] = 0
     if request.param == 'cupy-csr':
-        X_sparse = cupyx.scipy.sparse.csr_matrix(X)
+        X_sparse = cpx.scipy.sparse.csr_matrix(X)
     elif request.param == 'cupy-csc':
-        X_sparse = cupyx.scipy.sparse.csc_matrix(X)
+        X_sparse = cpx.scipy.sparse.csc_matrix(X)
     return X.get(), X, X_sparse.get(), X_sparse
 
 
@@ -157,6 +158,8 @@ def test_mean_variance_axis(failure_logger, sparse_random_dataset, axis):
 
 @pytest.mark.parametrize("axis", [None, 0, 1])
 @pytest.mark.parametrize("ignore_nan", [False, True])
+# ignore warning about changing sparsity in both cupy and scipy
+@pytest.mark.filterwarnings("ignore:(.*)expensive(.*)::")
 def test_min_max_axis(failure_logger, sparse_random_dataset, axis, ignore_nan):
     _, X, X_sparse_np, X_sparse = sparse_random_dataset
     X_sparse[0, 0] = np.nan
@@ -191,14 +194,18 @@ def sparse_extremes(request, random_seed):
        [0.0, 0.0, cp.nan],
        [0.0, cp.nan, cp.nan]])
     if request.param == 'cupy-csr':
-        X_sparse = cupyx.scipy.sparse.csr_matrix(X)
+        X_sparse = cpx.scipy.sparse.csr_matrix(X)
     elif request.param == 'cupy-csc':
-        X_sparse = cupyx.scipy.sparse.csc_matrix(X)
+        X_sparse = cpx.scipy.sparse.csc_matrix(X)
     return X_sparse.get(), X_sparse
 
 
 @pytest.mark.parametrize("axis", [None, 0, 1])
 @pytest.mark.parametrize("ignore_nan", [False, True])
+# ignore warning about changing sparsity in both cupy and scipy
+@pytest.mark.filterwarnings("ignore:(.*)expensive(.*)::")
+# ignore warning about all nan row in sparse_extremes
+@pytest.mark.filterwarnings("ignore:All-NaN(.*)::")
 def test_min_max_axis_extremes(sparse_extremes, axis, ignore_nan):
     X_sparse_np, X_sparse = sparse_extremes
 
diff --git a/python/cuml/test/test_tsne.py b/python/cuml/test/test_tsne.py
index 8158543666..5f5d2bf58c 100644
--- a/python/cuml/test/test_tsne.py
+++ b/python/cuml/test/test_tsne.py
@@ -13,8 +13,10 @@
 # limitations under the License.
 #
 
-import numpy as np
 import pytest
+pytestmark = pytest.mark.filterwarnings("ignore:Method 'fft' is experimental::")
+
+import numpy as np
 import scipy
 import cupyx
 
diff --git a/python/cuml/test/test_umap.py b/python/cuml/test/test_umap.py
index 28b84e9375..f706046282 100644
--- a/python/cuml/test/test_umap.py
+++ b/python/cuml/test/test_umap.py
@@ -96,7 +96,7 @@ def test_supervised_umap_trustworthiness_on_iris():
     embedding = cuUMAP(n_neighbors=10, random_state=0,
                        min_dist=0.01).fit_transform(
         data, iris.target, convert_dtype=True)
-    trust = trustworthiness(iris.data, embedding, 10)
+    trust = trustworthiness(iris.data, embedding, n_neighbors=10)
     assert trust >= 0.97
 
 
@@ -109,7 +109,7 @@ def test_semisupervised_umap_trustworthiness_on_iris():
                        min_dist=0.01).fit_transform(
         data, target, convert_dtype=True)
 
-    trust = trustworthiness(iris.data, embedding, 10)
+    trust = trustworthiness(iris.data, embedding, n_neighbors=10)
     assert trust >= 0.97
 
 
@@ -119,7 +119,7 @@ def test_umap_trustworthiness_on_iris():
     embedding = cuUMAP(n_neighbors=10, min_dist=0.01,
                        random_state=0).fit_transform(
         data, convert_dtype=True)
-    trust = trustworthiness(iris.data, embedding, 10)
+    trust = trustworthiness(iris.data, embedding, n_neighbors=10)
     assert trust >= 0.97
 
 
@@ -140,7 +140,7 @@ def test_umap_transform_on_iris(target_metric):
 
     assert not np.isnan(embedding).any()
 
-    trust = trustworthiness(new_data, embedding, 10)
+    trust = trustworthiness(new_data, embedding, n_neighbors=10)
     assert trust >= 0.85
 
 
@@ -183,7 +183,8 @@ def test_umap_transform_on_digits_sparse(target_metric, input_type,
     if input_type == 'cupy':
         embedding = embedding.get()
 
-    trust = trustworthiness(digits.data[~digits_selection], embedding, 15)
+    trust = trustworthiness(digits.data[~digits_selection], embedding,
+                            n_neighbors=15)
     assert trust >= 0.96
 
 
@@ -208,7 +209,8 @@ def test_umap_transform_on_digits(target_metric):
     new_data = digits.data[~digits_selection]
 
     embedding = fitter.transform(new_data, convert_dtype=True)
-    trust = trustworthiness(digits.data[~digits_selection], embedding, 15)
+    trust = trustworthiness(digits.data[~digits_selection], embedding,
+                            n_neighbors=15)
     assert trust >= 0.96
 
 
@@ -241,8 +243,8 @@ def test_umap_fit_transform_trust(name, target_metric):
     embedding = model.fit_transform(data)
     cuml_embedding = cuml_model.fit_transform(data, convert_dtype=True)
 
-    trust = trustworthiness(data, embedding, 10)
-    cuml_trust = trustworthiness(data, cuml_embedding, 10)
+    trust = trustworthiness(data, embedding, n_neighbors=10)
+    cuml_trust = trustworthiness(data, cuml_embedding, n_neighbors=10)
 
     assert array_equal(trust, cuml_trust, 1e-1, with_sign=True)
 
@@ -278,6 +280,7 @@ def test_umap_data_formats(input_type, should_downcast,
 
 
 @pytest.mark.parametrize('target_metric', ["categorical", "euclidean"])
+@pytest.mark.filterwarnings("ignore:(.*)connected(.*):UserWarning:sklearn[.*]")
 def test_umap_fit_transform_score_default(target_metric):
 
     n_samples = 500
@@ -434,7 +437,7 @@ def test_umap_fit_transform_trustworthiness_with_consistency_enabled():
     algo = cuUMAP(n_neighbors=10, min_dist=0.01, init="random",
                   random_state=42)
     embedding = algo.fit_transform(data, convert_dtype=True)
-    trust = trustworthiness(iris.data, embedding, 10)
+    trust = trustworthiness(iris.data, embedding, n_neighbors=10)
     assert trust >= 0.97
 
 
@@ -449,10 +452,11 @@ def test_umap_transform_trustworthiness_with_consistency_enabled():
                    random_state=42)
     model.fit(fit_data, convert_dtype=True)
     embedding = model.transform(transform_data, convert_dtype=True)
-    trust = trustworthiness(transform_data, embedding, 10)
+    trust = trustworthiness(transform_data, embedding, n_neighbors=10)
     assert trust >= 0.92
 
 
+@pytest.mark.filterwarnings("ignore:(.*)zero(.*)::scipy[.*]|umap[.*]")
 def test_exp_decay_params():
     def compare_exp_decay_params(a=None, b=None, min_dist=0.1, spread=1.0):
         cuml_model = cuUMAP(a=a, b=b, min_dist=min_dist, spread=spread)
@@ -494,7 +498,7 @@ def transform_embed(knn_graph=None):
                                convert_dtype=True)
 
     def test_trustworthiness(embedding):
-        trust = trustworthiness(data, embedding, n_neighbors)
+        trust = trustworthiness(data, embedding, n_neighbors=n_neighbors)
         assert trust >= 0.92
 
     def test_equality(e1, e2):
diff --git a/python/cuml/tsa/seasonality.pyx b/python/cuml/tsa/seasonality.pyx
index bdd7d3008f..786306114c 100644
--- a/python/cuml/tsa/seasonality.pyx
+++ b/python/cuml/tsa/seasonality.pyx
@@ -15,10 +15,7 @@
 
 # distutils: language = c++
 
-import ctypes
 import numpy as np
-from libc.stdint cimport uintptr_t
-from libcpp cimport bool
 
 import cuml.internals
 from cuml.common.array import CumlArray
diff --git a/python/cuml/tsa/stationarity.pyx b/python/cuml/tsa/stationarity.pyx
index 85ba5a9a4e..8db12b94e8 100644
--- a/python/cuml/tsa/stationarity.pyx
+++ b/python/cuml/tsa/stationarity.pyx
@@ -18,7 +18,7 @@
 import ctypes
 import numpy as np
 from libc.stdint cimport uintptr_t
-from libcpp cimport bool
+from libcpp cimport bool as boolcpp
 
 import cuml.internals
 from cuml.common.array import CumlArray
@@ -31,7 +31,7 @@ cdef extern from "cuml/tsa/stationarity.h" namespace "ML":
     int cpp_kpss "ML::Stationarity::kpss_test" (
         const handle_t& handle,
         const float* d_y,
-        bool* results,
+        boolcpp* results,
         int batch_size,
         int n_obs,
         int d, int D, int s,
@@ -40,7 +40,7 @@ cdef extern from "cuml/tsa/stationarity.h" namespace "ML":
     int cpp_kpss "ML::Stationarity::kpss_test" (
         const handle_t& handle,
         const double* d_y,
-        bool* results,
+        boolcpp* results,
         int batch_size,
         int n_obs,
         int d, int D, int s,
@@ -96,7 +96,7 @@ def kpss_test(y, d=0, D=0, s=0, pval_threshold=0.05,
     if dtype == np.float32:
         cpp_kpss(handle_[0],
                  <float*> d_y_ptr,
-                 <bool*> d_results,
+                 <boolcpp*> d_results,
                  <int> batch_size,
                  <int> n_obs,
                  <int> d, <int> D, <int> s,
@@ -104,7 +104,7 @@ def kpss_test(y, d=0, D=0, s=0, pval_threshold=0.05,
     elif dtype == np.float64:
         cpp_kpss(handle_[0],
                  <double*> d_y_ptr,
-                 <bool*> d_results,
+                 <boolcpp*> d_results,
                  <int> batch_size,
                  <int> n_obs,
                  <int> d, <int> D, <int> s,

From 9a13cedbceb9597133bccafba022dbbbaffdd55b Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Fri, 10 Sep 2021 14:40:55 -0500
Subject: [PATCH 13/18] FIX Multiple pytest warning fixes and PEP8 corrections

---
 python/cuml/dask/decomposition/pca.py                 |  2 +-
 python/cuml/dask/decomposition/tsvd.py                |  4 ++--
 python/cuml/dask/ensemble/base.py                     |  2 +-
 python/cuml/dask/manifold/umap.py                     |  2 +-
 python/cuml/preprocessing/text/stem/porter_stemmer.py |  1 -
 python/cuml/test/dask/test_linear_regression.py       |  4 ++--
 python/cuml/test/dask/test_ridge_regression.py        |  4 ++--
 python/cuml/test/dask/test_tfidf.py                   |  2 ++
 python/cuml/test/dask/test_umap.py                    |  2 +-
 python/cuml/test/test_nearest_neighbors.py            | 10 ++++++----
 python/cuml/test/test_one_hot_encoder.py              |  6 ++++--
 python/cuml/test/test_pickle.py                       |  9 ++++++---
 python/cuml/test/test_random_forest.py                |  5 ++++-
 python/cuml/test/test_tfidf.py                        |  3 ++-
 python/cuml/test/test_tsne.py                         |  7 ++++---
 python/cuml/thirdparty_adapters/sparsefuncs_fast.py   |  5 +++--
 16 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/python/cuml/dask/decomposition/pca.py b/python/cuml/dask/decomposition/pca.py
index c47af162db..3cd9d01cd7 100644
--- a/python/cuml/dask/decomposition/pca.py
+++ b/python/cuml/dask/decomposition/pca.py
@@ -158,7 +158,7 @@ class PCA(BaseDecomposition,
 
     def __init__(self, *, client=None, verbose=False, **kwargs):
 
-        super().__init__(PCA._create_pca,
+        super().__init__(model_func=PCA._create_pca,
                          client=client,
                          verbose=verbose,
                          **kwargs)
diff --git a/python/cuml/dask/decomposition/tsvd.py b/python/cuml/dask/decomposition/tsvd.py
index bf59a74692..502fb2a0e9 100644
--- a/python/cuml/dask/decomposition/tsvd.py
+++ b/python/cuml/dask/decomposition/tsvd.py
@@ -121,8 +121,8 @@ def __init__(self, *, client=None, **kwargs):
         """
         Constructor for distributed TruncatedSVD model
         """
-        super().__init__(TruncatedSVD._create_tsvd,
-                         client,
+        super().__init__(model_func=TruncatedSVD._create_tsvd,
+                         client=client,
                          **kwargs)
 
     def fit(self, X, _transform=False):
diff --git a/python/cuml/dask/ensemble/base.py b/python/cuml/dask/ensemble/base.py
index 10dd0e2f90..761d2e6204 100644
--- a/python/cuml/dask/ensemble/base.py
+++ b/python/cuml/dask/ensemble/base.py
@@ -102,7 +102,7 @@ def _fit(self, model, dataset, convert_dtype, broadcast_data):
         self.active_workers = data.workers
         self.datatype = data.datatype
         if self.datatype == 'cudf':
-            has_float64 = (dataset[0].dtypes.any() == np.float64)
+            has_float64 = (dataset[0].dtypes == np.float64).any()
         else:
             has_float64 = (dataset[0].dtype == np.float64)
         if has_float64:
diff --git a/python/cuml/dask/manifold/umap.py b/python/cuml/dask/manifold/umap.py
index 8ea60d2364..c940f0babd 100644
--- a/python/cuml/dask/manifold/umap.py
+++ b/python/cuml/dask/manifold/umap.py
@@ -93,7 +93,7 @@ class UMAP(BaseEstimator,
 
     """
     def __init__(self, *, model, client=None, **kwargs):
-        super().__init__(client, **kwargs)
+        super().__init__(client=client, **kwargs)
 
         self._set_internal_model(model)
 
diff --git a/python/cuml/preprocessing/text/stem/porter_stemmer.py b/python/cuml/preprocessing/text/stem/porter_stemmer.py
index a0ae1b77ca..00ffa9eaac 100644
--- a/python/cuml/preprocessing/text/stem/porter_stemmer.py
+++ b/python/cuml/preprocessing/text/stem/porter_stemmer.py
@@ -15,7 +15,6 @@
 #
 
 import cudf
-import numpy as np
 import cupy as cp
 from .porter_stemmer_utils.suffix_utils import (
     get_stem_series,
diff --git a/python/cuml/test/dask/test_linear_regression.py b/python/cuml/test/dask/test_linear_regression.py
index 3dcc191b09..f1a049caae 100644
--- a/python/cuml/test/dask/test_linear_regression.py
+++ b/python/cuml/test/dask/test_linear_regression.py
@@ -73,8 +73,8 @@ def imp():
     from cuml.dask.linear_model import LinearRegression as cumlOLS_dask
 
     n_info = 5
-    nrows = np.int(nrows)
-    ncols = np.int(ncols)
+    nrows = int(nrows)
+    ncols = int(ncols)
     X, y = make_regression_dataset(datatype, nrows, ncols, n_info)
 
     X_df, y_df = _prep_training_data(client, X, y, n_parts)
diff --git a/python/cuml/test/dask/test_ridge_regression.py b/python/cuml/test/dask/test_ridge_regression.py
index ce812c1502..582e0772f1 100644
--- a/python/cuml/test/dask/test_ridge_regression.py
+++ b/python/cuml/test/dask/test_ridge_regression.py
@@ -68,8 +68,8 @@ def test_ridge(nrows, ncols, n_parts, fit_intercept,
     from cuml.dask.linear_model import Ridge as cumlRidge_dask
 
     n_info = 5
-    nrows = np.int(nrows)
-    ncols = np.int(ncols)
+    nrows = int(nrows)
+    ncols = int(ncols)
     X, y = make_regression_dataset(datatype, nrows, ncols, n_info)
 
     X_df, y_df = _prep_training_data(client, X, y, n_parts)
diff --git a/python/cuml/test/dask/test_tfidf.py b/python/cuml/test/dask/test_tfidf.py
index ae79c7426a..2c96305ff1 100644
--- a/python/cuml/test/dask/test_tfidf.py
+++ b/python/cuml/test/dask/test_tfidf.py
@@ -94,6 +94,8 @@ def create_scipy_sparse_array_from_dask_cp_sparse_array(ar):
 @pytest.mark.parametrize("use_idf", [True, False])
 @pytest.mark.parametrize("smooth_idf", [True, False])
 @pytest.mark.parametrize("sublinear_tf", [True, False])
+@pytest.mark.filterwarnings("ignore:divide by zero(.*):RuntimeWarning:"
+                            "sklearn[.*]")
 def test_tfidf_transformer(
     data, norm, use_idf, smooth_idf, sublinear_tf, client
 ):
diff --git a/python/cuml/test/dask/test_umap.py b/python/cuml/test/dask/test_umap.py
index 3c3769ce6a..42ec8be838 100644
--- a/python/cuml/test/dask/test_umap.py
+++ b/python/cuml/test/dask/test_umap.py
@@ -103,7 +103,7 @@ def _umap_mnmg_trustworthiness(local_X, local_y,
 
     local_model.fit(X_train, y=y_train)
 
-    distributed_model = MNMG_UMAP(local_model)
+    distributed_model = MNMG_UMAP(model=local_model)
     embedding = distributed_model.transform(X_transform_d)
 
     embedding = embedding.compute()
diff --git a/python/cuml/test/test_nearest_neighbors.py b/python/cuml/test/test_nearest_neighbors.py
index 3a33e1c94b..534f2e0d64 100644
--- a/python/cuml/test/test_nearest_neighbors.py
+++ b/python/cuml/test/test_nearest_neighbors.py
@@ -15,8 +15,6 @@
 #
 
 import pytest
-pytestmark = pytest.mark.filterwarnings("ignore:((.|\n)*)#4020((.|\n)*):UserWarning:cuml[.*]")
-
 import math
 
 from cuml.test.utils import array_equal, unit_param, quality_param, \
@@ -44,6 +42,10 @@
 import gc
 
 
+pytestmark = pytest.mark.filterwarnings("ignore:((.|\n)*)#4020((.|\n)*):"
+                                        "UserWarning:cuml[.*]")
+
+
 def predict(neigh_ind, _y, n_neighbors):
     import scipy.stats as stats
 
@@ -528,9 +530,9 @@ def test_nearest_neighbors_sparse(metric,
         return
 
     a = cupyx.scipy.sparse.random(nrows, ncols, format='csr', density=density,
-                         random_state=35)
+                                  random_state=35)
     b = cupyx.scipy.sparse.random(nrows, ncols, format='csr', density=density,
-                         random_state=38)
+                                  random_state=38)
 
     if metric == 'jaccard':
         a = a.astype('bool').astype('float32')
diff --git a/python/cuml/test/test_one_hot_encoder.py b/python/cuml/test/test_one_hot_encoder.py
index 6a5045504b..7b66f9d862 100644
--- a/python/cuml/test/test_one_hot_encoder.py
+++ b/python/cuml/test/test_one_hot_encoder.py
@@ -129,7 +129,8 @@ def test_onehot_categories(as_array):
 
 
 @pytest.mark.parametrize('as_array', [True, False], ids=['cupy', 'cudf'])
-@pytest.mark.filterwarnings("ignore:((.|\n)*)unknown((.|\n)*):UserWarning:cuml[.*]")
+@pytest.mark.filterwarnings("ignore:((.|\n)*)unknown((.|\n)*):UserWarning:"
+                            "cuml[.*]")
 def test_onehot_fit_handle_unknown(as_array):
     X = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
     Y = DataFrame({'chars': ['c', 'b'], 'int': [0, 2]})
@@ -167,7 +168,8 @@ def test_onehot_transform_handle_unknown(as_array):
 
 
 @pytest.mark.parametrize('as_array', [True, False], ids=['cupy', 'cudf'])
-@pytest.mark.filterwarnings("ignore:((.|\n)*)unknown((.|\n)*):UserWarning:cuml[.*]")
+@pytest.mark.filterwarnings("ignore:((.|\n)*)unknown((.|\n)*):UserWarning:"
+                            "cuml[.*]")
 def test_onehot_inverse_transform_handle_unknown(as_array):
     X = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
     Y_ohe = cp.array([[0., 0., 1., 0.],
diff --git a/python/cuml/test/test_pickle.py b/python/cuml/test/test_pickle.py
index 0f3da56346..19d49e2d2f 100644
--- a/python/cuml/test/test_pickle.py
+++ b/python/cuml/test/test_pickle.py
@@ -366,7 +366,8 @@ def assert_model(pickled_model, X_test):
 
 @pytest.mark.parametrize('model_name',
                          all_models.keys())
-@pytest.mark.filterwarnings("ignore:Transformers((.|\n)*):UserWarning:cuml[.*]")
+@pytest.mark.filterwarnings("ignore:Transformers((.|\n)*):UserWarning:"
+                            "cuml[.*]")
 def test_unfit_pickle(model_name):
     # Any model xfailed in this test cannot be used for hyperparameter sweeps
     # with dask or sklearn
@@ -383,7 +384,8 @@ def test_unfit_pickle(model_name):
 
 @pytest.mark.parametrize('model_name',
                          all_models.keys())
-@pytest.mark.filterwarnings("ignore:Transformers((.|\n)*):UserWarning:cuml[.*]")
+@pytest.mark.filterwarnings("ignore:Transformers((.|\n)*):UserWarning:"
+                            "cuml[.*]")
 def test_unfit_clone(model_name):
     if model_name in unfit_clone_xfail:
         pytest.xfail()
@@ -708,7 +710,8 @@ def assert_model(pickled_model, X):
 @pytest.mark.parametrize('nrows', [unit_param(100)])
 @pytest.mark.parametrize('ncols', [unit_param(20)])
 @pytest.mark.parametrize('n_info', [unit_param(10)])
-@pytest.mark.filterwarnings("ignore:((.|\n)*)n_streams((.|\n)*):UserWarning:cuml[.*]")
+@pytest.mark.filterwarnings("ignore:((.|\n)*)n_streams((.|\n)*):UserWarning:"
+                            "cuml[.*]")
 def test_small_rf(tmpdir, key, datatype, nrows, ncols, n_info):
 
     result = {}
diff --git a/python/cuml/test/test_random_forest.py b/python/cuml/test/test_random_forest.py
index e430fc1dd1..38fb02216a 100644
--- a/python/cuml/test/test_random_forest.py
+++ b/python/cuml/test/test_random_forest.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 #
 import pytest
-pytestmark = pytest.mark.filterwarnings("ignore: For reproducible results(.*)::cuml[.*]")
 
 import cudf
 import numpy as np
@@ -43,6 +42,10 @@
 import treelite
 
 
+pytestmark = pytest.mark.filterwarnings("ignore: For reproducible results(.*)"
+                                        "::cuml[.*]")
+
+
 @pytest.fixture(
     scope="session",
     params=[
diff --git a/python/cuml/test/test_tfidf.py b/python/cuml/test/test_tfidf.py
index 194a84f81c..42d3b87205 100644
--- a/python/cuml/test/test_tfidf.py
+++ b/python/cuml/test/test_tfidf.py
@@ -54,7 +54,8 @@
 @pytest.mark.parametrize('use_idf', [True, False])
 @pytest.mark.parametrize('smooth_idf', [True, False])
 @pytest.mark.parametrize('sublinear_tf', [True, False])
-@pytest.mark.filterwarnings("ignore:divide by zero(.*):RuntimeWarning:sklearn[.*]")
+@pytest.mark.filterwarnings("ignore:divide by zero(.*):RuntimeWarning:"
+                            "sklearn[.*]")
 def test_tfidf_transformer(data, norm, use_idf, smooth_idf, sublinear_tf):
     data_gpu = cp.array(data)
 
diff --git a/python/cuml/test/test_tsne.py b/python/cuml/test/test_tsne.py
index 5f5d2bf58c..8c6285153e 100644
--- a/python/cuml/test/test_tsne.py
+++ b/python/cuml/test/test_tsne.py
@@ -13,10 +13,8 @@
 # limitations under the License.
 #
 
-import pytest
-pytestmark = pytest.mark.filterwarnings("ignore:Method 'fft' is experimental::")
-
 import numpy as np
+import pytest
 import scipy
 import cupyx
 
@@ -29,6 +27,9 @@
 from sklearn import datasets
 
 
+pytestmark = pytest.mark.filterwarnings("ignore:Method 'fft' is "
+                                        "experimental::")
+
 DEFAULT_N_NEIGHBORS = 90
 DEFAULT_PERPLEXITY = 30
 
diff --git a/python/cuml/thirdparty_adapters/sparsefuncs_fast.py b/python/cuml/thirdparty_adapters/sparsefuncs_fast.py
index 3e10c47c6f..ee6d1c42fa 100644
--- a/python/cuml/thirdparty_adapters/sparsefuncs_fast.py
+++ b/python/cuml/thirdparty_adapters/sparsefuncs_fast.py
@@ -358,5 +358,6 @@ def csr_polynomial_expansion(X, interaction_only, degree):
                                 expanded_indptr)
 
     return cpx.scipy.sparse.csr_matrix((expanded_data, expanded_indices,
-                                 expanded_indptr),
-                                shape=(num_rows, expanded_dimensionality))
+                                        expanded_indptr),
+                                       shape=(num_rows,
+                                              expanded_dimensionality))

From ff60a802002830366e857a4969d1368e2a620854 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Fri, 10 Sep 2021 14:42:29 -0500
Subject: [PATCH 14/18] FIX Remove gpu ci install

---
 ci/gpu/build.sh | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 737acdd038..e646307cf6 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -199,10 +199,6 @@ else
     pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps
     set +x
 
-    # https://docs.rapids.ai/maintainers/depmgmt/
-    gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env
-    gpuci_conda_retry install -y "scikit-learn=0.24"
-
     gpuci_logger "Building cuml"
     "$WORKSPACE/build.sh" -v cuml --codecov
 

From f7ac71f5aae9375f98bc04b3dcd699b0a6b2ac97 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Fri, 10 Sep 2021 14:56:35 -0500
Subject: [PATCH 15/18] FIX Copyright years

---
 python/cuml/common/sparsefuncs.py                     | 2 +-
 python/cuml/preprocessing/text/stem/porter_stemmer.py | 2 +-
 python/cuml/test/dask/test_linear_regression.py       | 2 +-
 python/cuml/test/dask/test_ridge_regression.py        | 2 +-
 python/cuml/test/dask/test_tfidf.py                   | 2 +-
 python/cuml/test/test_holtwinters.py                  | 2 +-
 python/cuml/test/test_make_classification.py          | 2 +-
 python/cuml/test/test_stationarity.py                 | 2 +-
 python/cuml/test/test_tfidf.py                        | 2 +-
 python/cuml/thirdparty_adapters/sparsefuncs_fast.py   | 2 +-
 python/cuml/tsa/seasonality.pyx                       | 2 +-
 python/cuml/tsa/stationarity.pyx                      | 2 +-
 12 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/python/cuml/common/sparsefuncs.py b/python/cuml/common/sparsefuncs.py
index b6fe00dac2..adfc54e456 100644
--- a/python/cuml/common/sparsefuncs.py
+++ b/python/cuml/common/sparsefuncs.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/cuml/preprocessing/text/stem/porter_stemmer.py b/python/cuml/preprocessing/text/stem/porter_stemmer.py
index 00ffa9eaac..5816141d88 100644
--- a/python/cuml/preprocessing/text/stem/porter_stemmer.py
+++ b/python/cuml/preprocessing/text/stem/porter_stemmer.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/cuml/test/dask/test_linear_regression.py b/python/cuml/test/dask/test_linear_regression.py
index f1a049caae..142cae7b7e 100644
--- a/python/cuml/test/dask/test_linear_regression.py
+++ b/python/cuml/test/dask/test_linear_regression.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/cuml/test/dask/test_ridge_regression.py b/python/cuml/test/dask/test_ridge_regression.py
index 582e0772f1..f5811e403e 100644
--- a/python/cuml/test/dask/test_ridge_regression.py
+++ b/python/cuml/test/dask/test_ridge_regression.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/cuml/test/dask/test_tfidf.py b/python/cuml/test/dask/test_tfidf.py
index 2c96305ff1..573976b8a5 100644
--- a/python/cuml/test/dask/test_tfidf.py
+++ b/python/cuml/test/dask/test_tfidf.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/cuml/test/test_holtwinters.py b/python/cuml/test/test_holtwinters.py
index 71fba7e8bb..7e92411568 100644
--- a/python/cuml/test/test_holtwinters.py
+++ b/python/cuml/test/test_holtwinters.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/cuml/test/test_make_classification.py b/python/cuml/test/test_make_classification.py
index f715261c00..4f0fc93bfc 100644
--- a/python/cuml/test/test_make_classification.py
+++ b/python/cuml/test/test_make_classification.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-201, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/cuml/test/test_stationarity.py b/python/cuml/test/test_stationarity.py
index 78dffc984f..f606dba5d3 100644
--- a/python/cuml/test/test_stationarity.py
+++ b/python/cuml/test/test_stationarity.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/cuml/test/test_tfidf.py b/python/cuml/test/test_tfidf.py
index 42d3b87205..67a70b5ddb 100644
--- a/python/cuml/test/test_tfidf.py
+++ b/python/cuml/test/test_tfidf.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/cuml/thirdparty_adapters/sparsefuncs_fast.py b/python/cuml/thirdparty_adapters/sparsefuncs_fast.py
index ee6d1c42fa..8cc043aca5 100644
--- a/python/cuml/thirdparty_adapters/sparsefuncs_fast.py
+++ b/python/cuml/thirdparty_adapters/sparsefuncs_fast.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/cuml/tsa/seasonality.pyx b/python/cuml/tsa/seasonality.pyx
index 786306114c..0a06531b67 100644
--- a/python/cuml/tsa/seasonality.pyx
+++ b/python/cuml/tsa/seasonality.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/cuml/tsa/stationarity.pyx b/python/cuml/tsa/stationarity.pyx
index 8db12b94e8..f9d662800b 100644
--- a/python/cuml/tsa/stationarity.pyx
+++ b/python/cuml/tsa/stationarity.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 36b8b255bba869fbd435012ce242936e9b6deb0b Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Fri, 10 Sep 2021 15:55:06 -0500
Subject: [PATCH 16/18] FIX Copyright years

---
 python/cuml/test/test_make_classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cuml/test/test_make_classification.py b/python/cuml/test/test_make_classification.py
index 4f0fc93bfc..50be5d9511 100644
--- a/python/cuml/test/test_make_classification.py
+++ b/python/cuml/test/test_make_classification.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2020-201, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 2c9b0497207af6ec3ecc06c0fe30a45caaa76c54 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Wed, 27 Oct 2021 11:46:30 -0500
Subject: [PATCH 17/18] Merge branch-21.12

---
 BUILD.md                                      |   5 +-
 CHANGELOG.md                                  |   4 +
 README.md                                     |   2 +-
 build.sh                                      |   7 +
 ci/checks/style.sh                            |   2 +-
 ci/cpu/build.sh                               |   2 +-
 ci/docs/build.sh                              |   2 +-
 ci/gpu/build.sh                               |   4 +-
 ci/gpu/test-notebooks.sh                      |   1 +
 ci/local/build.sh                             |   6 +
 ci/release/update-version.sh                  |   2 +
 ci/utils/nbtest.sh                            |   1 +
 conda/environments/cuml_dev_cuda11.0.yml      |  22 +-
 conda/environments/cuml_dev_cuda11.2.yml      |  22 +-
 conda/environments/cuml_dev_cuda11.4.yml      |  22 +-
 conda/recipes/cuml/meta.yaml                  |  12 +-
 conda/recipes/libcuml/meta.yaml               |   8 +-
 cpp/CMakeLists.txt                            | 166 ++--
 cpp/bench/prims/distance_common.cuh           |   2 +-
 cpp/bench/prims/fused_l2_nn.cu                |  11 +-
 cpp/bench/sg/arima_loglikelihood.cu           |   6 +-
 cpp/bench/sg/fil.cu                           |   2 +-
 cpp/cmake/thirdparty/get_raft.cmake           |  17 +-
 cpp/cmake/thirdparty/get_treelite.cmake       |   4 +-
 .../cuml/common/pinned_host_vector.hpp        |  64 ++
 cpp/include/cuml/decomposition/params.hpp     |   2 -
 cpp/include/cuml/ensemble/randomforest.hpp    |   3 +-
 cpp/include/cuml/fil/fil.h                    |   5 +-
 cpp/include/cuml/manifold/umap.hpp            |  47 +-
 cpp/include/cuml/neighbors/knn.hpp            |  11 +
 cpp/include/cuml/tree/algo_helper.h           |   3 +
 cpp/include/cuml/tree/decisiontree.hpp        |   2 +
 cpp/include/cuml/tree/flatnode.h              |  23 +-
 cpp/include/cuml/tsa/arima_common.h           |  11 +-
 cpp/include/cuml/tsa/batched_arima.hpp        |  21 +-
 cpp/include/cuml/tsa/batched_kalman.hpp       |   5 +-
 cpp/src/arima/batched_arima.cu                |  75 +-
 cpp/src/arima/batched_kalman.cu               | 708 +++++++++---------
 cpp/src/dbscan/adjgraph/naive.cuh             |   9 +-
 cpp/src/dbscan/dbscan.cuh                     |   3 +
 .../batched-levelalgo/builder.cuh             | 131 ++--
 .../batched-levelalgo/kernels.cuh             |  20 +-
 .../batched-levelalgo/metrics.cuh             | 574 ++++++++++----
 cpp/src/decisiontree/decisiontree.cu          |   4 +-
 cpp/src/decisiontree/decisiontree.cuh         | 211 ++++--
 cpp/src/fil/common.cuh                        | 191 ++++-
 cpp/src/fil/fil.cu                            | 396 +++++++---
 cpp/src/fil/infer.cu                          | 127 +---
 cpp/src/fil/internal.cuh                      | 309 ++++++--
 cpp/src/glm/ols.cuh                           |  21 +-
 cpp/src/hdbscan/detail/reachability.cuh       |   2 +-
 cpp/src/kmeans/common.cuh                     |   3 +-
 cpp/src/knn/knn.cu                            |  19 +
 cpp/src/metrics/pairwise_distance.cu          |  63 +-
 cpp/src/metrics/pairwise_distance_canberra.cu |  38 +-
 .../metrics/pairwise_distance_canberra.cuh    |   4 +-
 .../metrics/pairwise_distance_chebyshev.cu    |  26 +-
 .../metrics/pairwise_distance_chebyshev.cuh   |   4 +-
 .../metrics/pairwise_distance_correlation.cu  |  59 ++
 .../metrics/pairwise_distance_correlation.cuh |  47 ++
 cpp/src/metrics/pairwise_distance_cosine.cu   |  31 +-
 cpp/src/metrics/pairwise_distance_cosine.cuh  |   4 +-
 .../metrics/pairwise_distance_euclidean.cu    |  41 +-
 .../metrics/pairwise_distance_euclidean.cuh   |   2 +-
 cpp/src/metrics/pairwise_distance_hamming.cu  |  59 ++
 cpp/src/metrics/pairwise_distance_hamming.cuh |  47 ++
 .../metrics/pairwise_distance_hellinger.cu    |  31 +-
 .../metrics/pairwise_distance_hellinger.cuh   |   4 +-
 .../pairwise_distance_jensen_shannon.cu       |  56 ++
 .../pairwise_distance_jensen_shannon.cuh      |  47 ++
 .../pairwise_distance_kl_divergence.cu        |  55 ++
 .../pairwise_distance_kl_divergence.cuh       |  47 ++
 cpp/src/metrics/pairwise_distance_l1.cu       |  28 +-
 cpp/src/metrics/pairwise_distance_l1.cuh      |   4 +-
 .../metrics/pairwise_distance_minkowski.cu    |  30 +-
 .../metrics/pairwise_distance_minkowski.cuh   |   4 +-
 .../metrics/pairwise_distance_russell_rao.cu  |  57 ++
 .../metrics/pairwise_distance_russell_rao.cuh |  47 ++
 cpp/src/metrics/trustworthiness.cu            |   2 +-
 cpp/src/randomforest/randomforest.cu          |  40 +-
 cpp/src/randomforest/randomforest.cuh         |  71 +-
 cpp/src/solver/lars_impl.cuh                  |   1 +
 cpp/src/umap/runner.cuh                       | 166 ++++
 cpp/src/umap/umap.cu                          |  37 +
 cpp/src_prims/linalg/block.cuh                | 126 +++-
 cpp/src_prims/linalg/lstsq.cuh                | 331 +++++++-
 cpp/src_prims/matrix/grammatrix.cuh           |   2 +-
 cpp/src_prims/matrix/kernelmatrices.cuh       |   2 +-
 cpp/src_prims/metrics/scores.cuh              |   2 +-
 cpp/src_prims/metrics/silhouette_score.cuh    |   2 +-
 cpp/src_prims/selection/knn.cuh               |   2 +-
 cpp/src_prims/timeSeries/fillna.cuh           | 167 +++++
 cpp/src_prims/timeSeries/jones_transform.cuh  |  43 +-
 cpp/test/CMakeLists.txt                       |  82 +-
 cpp/test/prims/fillna.cu                      | 176 +++++
 cpp/test/prims/jones_transform.cu             |  12 +-
 cpp/test/prims/linalg_block.cu                | 128 ++++
 cpp/test/prims/trustworthiness.cu             |   2 +-
 cpp/test/sg/dbscan_test.cu                    |   2 +-
 .../sg/decisiontree_batchedlevel_unittest.cu  |   2 +-
 cpp/test/sg/fil_child_index_test.cu           | 256 +++++++
 cpp/test/sg/fil_test.cu                       | 338 +++++++--
 cpp/test/sg/hdbscan_test.cu                   |   1 -
 cpp/test/sg/rf_test.cu                        | 605 ++++++++++++++-
 cpp/test/sg/rf_treelite_test.cu               | 570 --------------
 cpp/test/sg/rproj_test.cu                     |   2 +-
 cpp/test/sg/umap_parametrizable_test.cu       |  48 +-
 docs/source/api.rst                           |   5 +-
 docs/source/conf.py                           |   4 +-
 notebooks/arima_demo.ipynb                    | 104 ++-
 python/cuml/cluster/dbscan.pyx                |   4 +
 .../dask/ensemble/randomforestclassifier.py   |  30 +-
 .../dask/ensemble/randomforestregressor.py    |  26 +-
 python/cuml/ensemble/randomforest_common.pyx  |  51 +-
 python/cuml/ensemble/randomforest_shared.pxd  |   7 +-
 .../cuml/ensemble/randomforestclassifier.pyx  |  78 +-
 .../cuml/ensemble/randomforestregressor.pyx   |  42 +-
 python/cuml/explainer/kernel_shap.pyx         |   9 +-
 python/cuml/explainer/sampling.py             |  23 +-
 .../feature_extraction/_tfidf_vectorizer.py   |  12 +-
 .../cuml/linear_model/linear_regression.pyx   |  32 +-
 python/cuml/metrics/distance_type.pxd         |   3 +
 python/cuml/metrics/pairwise_distances.pyx    |  18 +-
 python/cuml/metrics/regression.pyx            |   8 +-
 python/cuml/metrics/trustworthiness.pyx       |   3 +
 python/cuml/naive_bayes/__init__.py           |   1 +
 python/cuml/naive_bayes/naive_bayes.py        | 454 ++++++++++-
 python/cuml/neighbors/__init__.py             |   6 +-
 python/cuml/neighbors/nearest_neighbors.pyx   | 108 ++-
 python/cuml/svm/svm_base.pyx                  |   4 +-
 python/cuml/test/dask/test_random_forest.py   |  58 +-
 .../explainer/test_explainer_kernel_shap.py   |  14 +
 python/cuml/test/test_arima.py                | 249 ++++--
 python/cuml/test/test_benchmark.py            |   9 +-
 python/cuml/test/test_dbscan.py               |   9 +
 python/cuml/test/test_fil.py                  |  66 +-
 python/cuml/test/test_metrics.py              |  35 +-
 python/cuml/test/test_naive_bayes.py          | 152 +++-
 python/cuml/test/test_nearest_neighbors.py    |  66 +-
 python/cuml/test/test_random_forest.py        | 137 +++-
 .../cuml/test/test_text_feature_extraction.py |  16 +-
 python/cuml/test/test_trustworthiness.py      |   8 +
 python/cuml/test/ts_datasets/README.md        |   5 +
 .../guest_nights_by_region_missing.csv        | 280 +++++++
 .../hourly_earnings_by_industry_missing.csv   | 124 +++
 .../population_estimate_missing.csv           | 138 ++++
 python/cuml/tsa/arima.pyx                     |  72 +-
 python/cuml/tsa/auto_arima.pyx                |  19 +
 python/setup.py                               |  19 +-
 149 files changed, 7298 insertions(+), 2595 deletions(-)
 mode change 100644 => 100755 ci/local/build.sh
 create mode 100644 cpp/include/cuml/common/pinned_host_vector.hpp
 create mode 100644 cpp/src/metrics/pairwise_distance_correlation.cu
 create mode 100644 cpp/src/metrics/pairwise_distance_correlation.cuh
 create mode 100644 cpp/src/metrics/pairwise_distance_hamming.cu
 create mode 100644 cpp/src/metrics/pairwise_distance_hamming.cuh
 create mode 100644 cpp/src/metrics/pairwise_distance_jensen_shannon.cu
 create mode 100644 cpp/src/metrics/pairwise_distance_jensen_shannon.cuh
 create mode 100644 cpp/src/metrics/pairwise_distance_kl_divergence.cu
 create mode 100644 cpp/src/metrics/pairwise_distance_kl_divergence.cuh
 create mode 100644 cpp/src/metrics/pairwise_distance_russell_rao.cu
 create mode 100644 cpp/src/metrics/pairwise_distance_russell_rao.cuh
 create mode 100644 cpp/src_prims/timeSeries/fillna.cuh
 create mode 100644 cpp/test/prims/fillna.cu
 create mode 100644 cpp/test/sg/fil_child_index_test.cu
 delete mode 100644 cpp/test/sg/rf_treelite_test.cu
 create mode 100644 python/cuml/test/ts_datasets/guest_nights_by_region_missing.csv
 create mode 100644 python/cuml/test/ts_datasets/hourly_earnings_by_industry_missing.csv
 create mode 100644 python/cuml/test/ts_datasets/population_estimate_missing.csv

diff --git a/BUILD.md b/BUILD.md
index 5fed2c1a54..76ee9074c6 100644
--- a/BUILD.md
+++ b/BUILD.md
@@ -19,7 +19,8 @@ It is recommended to use conda for environment/package management. If doing so,
 
 ```bash
 conda create -n cuml_dev python=3.7
-conda env update -n cuml_dev --file=conda/environments/cuml_dev_cuda10.2.yml
+conda activate cuml_dev
+conda env update --file=conda/environments/cuml_dev_cuda11.2.yml
 ```
 
 These conda environments are based on the general RAPIDS meta packages that install common dependencies for RAPIDS projects. To install different versions of packages contained in those meta packages after creating the environment, it is recommended to remove those meta packages (without removing the actual packages contained in the environment) with the following command (having the environment active):
@@ -53,7 +54,7 @@ Other `build.sh` options:
 $ ./build.sh clean                     # remove any prior build artifacts and configuration (start over)
 $ ./build.sh libcuml -v                # build and install libcuml with verbose output
 $ ./build.sh libcuml -g                # build and install libcuml for debug
-$ PARALLEL_LEVEL=4 ./build.sh libcuml  # build and install libcuml limiting parallel build jobs to 4 (make -j4)
+$ PARALLEL_LEVEL=8 ./build.sh libcuml  # build and install libcuml limiting parallel build jobs to 8 (ninja -j8)
 $ ./build.sh libcuml -n                # build libcuml but do not install
 $ ./build.sh prims --allgpuarch        # build the ML prims tests for all supported GPU architectures
 $ ./build.sh cuml --singlegpu          # build the cuML python package without MNMG algorithms
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a7a56e9a13..9d0eda9c82 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# cuML 21.12.00 (Date TBD)
+
+Please see https://github.com/rapidsai/cuml/releases/tag/v21.12.00a for the latest changes to this development branch.
+
 # cuML 21.10.00 (Date TBD)
 
 Please see https://github.com/rapidsai/cuml/releases/tag/v21.10.00a for the latest changes to this development branch.
diff --git a/README.md b/README.md
index 23445afb1a..a3255d9d68 100644
--- a/README.md
+++ b/README.md
@@ -86,7 +86,7 @@ repo](https://github.com/rapidsai/notebooks-contrib).
 | Category | Algorithm | Notes |
 | --- | --- | --- |
 | **Clustering** |  Density-Based Spatial Clustering of Applications with Noise (DBSCAN) | Multi-node multi-GPU via Dask |
-|  | Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN)  | Experimental |
+|  | Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN)  | |
 |  | K-Means | Multi-node multi-GPU via Dask |
 |  | Single-Linkage Agglomerative Clustering | |
 | **Dimensionality Reduction** | Principal Components Analysis (PCA) | Multi-node multi-GPU via Dask|
diff --git a/build.sh b/build.sh
index b100095f8d..fb077e50ec 100755
--- a/build.sh
+++ b/build.sh
@@ -46,6 +46,7 @@ HELP="$0 [<target> ...] [<flag> ...]
    --codecov        - Enable code coverage support by compiling with Cython linetracing
                       and profiling enabled (WARNING: Impacts performance)
    --ccache         - Use ccache to cache previous compilations
+   --nocloneraft    - CMake will clone RAFT even if it is in the environment, use this flag to disable that behavior
 
  default action (no args) is to build and install 'libcuml', 'cuml', and 'prims' targets only for the detected GPU arch
 
@@ -77,6 +78,7 @@ BUILD_CUML_TESTS=ON
 BUILD_CUML_MG_TESTS=OFF
 BUILD_STATIC_FAISS=OFF
 CMAKE_LOG_LEVEL=WARNING
+DISABLE_FORCE_CLONE_RAFT=OFF
 
 # Set defaults for vars that may not have been defined externally
 #  FIXME: if INSTALL_PREFIX is not set, check PREFIX, then check
@@ -129,6 +131,7 @@ LONG_ARGUMENT_LIST=(
     "codecov"
     "ccache"
     "nolibcumltest"
+    "nocloneraft"
 )
 
 # Short arguments
@@ -188,6 +191,9 @@ while true; do
         --nolibcumltest )
             BUILD_CUML_TESTS=OFF
             ;;
+        --nocloneraft )
+            DISABLE_FORCE_CLONE_RAFT=ON
+            ;;
         --)
             shift
             break
@@ -239,6 +245,7 @@ if completeBuild || hasArg libcuml || hasArg prims || hasArg bench || hasArg pri
           -DBUILD_CUML_TESTS=${BUILD_CUML_TESTS} \
           -DBUILD_CUML_MPI_COMMS=${BUILD_CUML_MG_TESTS} \
           -DBUILD_CUML_MG_TESTS=${BUILD_CUML_MG_TESTS} \
+          -DDISABLE_FORCE_CLONE_RAFT=${DISABLE_FORCE_CLONE_RAFT} \
           -DNVTX=${NVTX} \
           -DUSE_CCACHE=${CCACHE} \
           -DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \
diff --git a/ci/checks/style.sh b/ci/checks/style.sh
index b2f199e242..0e66c1b3de 100644
--- a/ci/checks/style.sh
+++ b/ci/checks/style.sh
@@ -15,7 +15,7 @@ cd "$WORKSPACE"
 
 export GIT_DESCRIBE_TAG=`git describe --tags`
 export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
-conda install "ucx-py=0.22.*" "ucx-proc=*=gpu"
+conda install "ucx-py=0.23.*" "ucx-proc=*=gpu"
 
 # Run flake8 and get results/return code
 FLAKE=`flake8 --config=python/setup.cfg`
diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
index 812baa4d9d..551db0c10d 100755
--- a/ci/cpu/build.sh
+++ b/ci/cpu/build.sh
@@ -7,7 +7,7 @@ set -ex
 
 # Set path and build parallel level
 export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH
-export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
+export PARALLEL_LEVEL=${PARALLEL_LEVEL:-8}
 
 # Set home to the job's workspace
 export HOME="$WORKSPACE"
diff --git a/ci/docs/build.sh b/ci/docs/build.sh
index 29c0de5f5b..a71cc790ff 100644
--- a/ci/docs/build.sh
+++ b/ci/docs/build.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 #################################
 # cuML Docs build script for CI #
 #################################
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index e646307cf6..d4df7c3968 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -15,7 +15,7 @@ function hasArg {
 
 # Set path and build parallel level
 export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH
-export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
+export PARALLEL_LEVEL=${PARALLEL_LEVEL:-8}
 export CONDA_ARTIFACT_PATH=${WORKSPACE}/ci/artifacts/cuml/cpu/.conda-bld/
 
 # Set home to the job's workspace
@@ -53,7 +53,7 @@ gpuci_mamba_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid
       "libcumlprims=${MINOR_VERSION}" \
       "dask-cudf=${MINOR_VERSION}" \
       "dask-cuda=${MINOR_VERSION}" \
-      "ucx-py=0.22.*" \
+      "ucx-py=0.23.*" \
       "ucx-proc=*=gpu" \
       "xgboost=1.4.2dev.rapidsai${MINOR_VERSION}" \
       "rapids-build-env=${MINOR_VERSION}.*" \
diff --git a/ci/gpu/test-notebooks.sh b/ci/gpu/test-notebooks.sh
index 342b9a3b5b..fe167d548f 100755
--- a/ci/gpu/test-notebooks.sh
+++ b/ci/gpu/test-notebooks.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 
 NOTEBOOKS_DIR="$WORKSPACE/notebooks"
 NBTEST="$WORKSPACE/ci/utils/nbtest.sh"
diff --git a/ci/local/build.sh b/ci/local/build.sh
old mode 100644
new mode 100755
index 18ee1cfc53..644647e038
--- a/ci/local/build.sh
+++ b/ci/local/build.sh
@@ -1,5 +1,11 @@
 #!/bin/bash
 
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+##############################################
+# cuML local build and test script for CI    #
+##############################################
+
+
 GIT_DESCRIBE_TAG=`git describe --tags`
 MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
 
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index c10694c56e..cf22839bb7 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -31,6 +31,8 @@ function sed_runner() {
 }
 
 sed_runner 's/'"CUML VERSION .* LANGUAGES"'/'"CUML VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt
+sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' cpp/CMakeLists.txt
+
 # RTD update
 sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/source/conf.py
 sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/source/conf.py
diff --git a/ci/utils/nbtest.sh b/ci/utils/nbtest.sh
index 1b39f267c6..28877273bf 100755
--- a/ci/utils/nbtest.sh
+++ b/ci/utils/nbtest.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 
 MAGIC_OVERRIDE_CODE="
 def my_run_line_magic(*args, **kwargs):
diff --git a/conda/environments/cuml_dev_cuda11.0.yml b/conda/environments/cuml_dev_cuda11.0.yml
index 320a7cb43a..6f622c64dd 100644
--- a/conda/environments/cuml_dev_cuda11.0.yml
+++ b/conda/environments/cuml_dev_cuda11.0.yml
@@ -6,23 +6,23 @@ channels:
 - conda-forge
 dependencies:
 - cudatoolkit=11.0
-- rapids-build-env=21.10.*
-- rapids-notebook-env=21.10.*
-- rapids-doc-env=21.10.*
-- cudf=21.10.*
-- rmm=21.10.*
-- libcumlprims=21.10.*
-- dask-cudf=21.10.*
-- dask-cuda=21.10.*
-- ucx-py=0.22
+- rapids-build-env=21.12.*
+- rapids-notebook-env=21.12.*
+- rapids-doc-env=21.12.*
+- cudf=21.12.*
+- rmm=21.12.*
+- libcumlprims=21.12.*
+- dask-cudf=21.12.*
+- dask-cuda=21.12.*
+- ucx-py=0.23
 - ucx-proc=*=gpu
 - dask-ml
 - doxygen>=1.8.20
 - libfaiss>=1.7.0
 - faiss-proc=*=cuda
 - umap-learn
-- scikit-learn=0.23.1
-- treelite=2.0.0
+- scikit-learn=0.24
+- treelite=2.1.0
 - statsmodels
 - seaborn
 - hdbscan
diff --git a/conda/environments/cuml_dev_cuda11.2.yml b/conda/environments/cuml_dev_cuda11.2.yml
index 7ca5f6bd11..f5828b2412 100644
--- a/conda/environments/cuml_dev_cuda11.2.yml
+++ b/conda/environments/cuml_dev_cuda11.2.yml
@@ -6,23 +6,23 @@ channels:
 - conda-forge
 dependencies:
 - cudatoolkit=11.2
-- rapids-build-env=21.10.*
-- rapids-notebook-env=21.10.*
-- rapids-doc-env=21.10.*
-- cudf=21.10.*
-- rmm=21.10.*
-- libcumlprims=21.10.*
-- dask-cudf=21.10.*
-- dask-cuda=21.10.*
-- ucx-py=0.22
+- rapids-build-env=21.12.*
+- rapids-notebook-env=21.12.*
+- rapids-doc-env=21.12.*
+- cudf=21.12.*
+- rmm=21.12.*
+- libcumlprims=21.12.*
+- dask-cudf=21.12.*
+- dask-cuda=21.12.*
+- ucx-py=0.23
 - ucx-proc=*=gpu
 - dask-ml
 - doxygen>=1.8.20
 - libfaiss>=1.7.0
 - faiss-proc=*=cuda
 - umap-learn
-- scikit-learn=0.23.1
-- treelite=2.0.0
+- scikit-learn=0.24
+- treelite=2.1.0
 - statsmodels
 - seaborn
 - hdbscan
diff --git a/conda/environments/cuml_dev_cuda11.4.yml b/conda/environments/cuml_dev_cuda11.4.yml
index e5a637cb1e..2f0b2192e8 100644
--- a/conda/environments/cuml_dev_cuda11.4.yml
+++ b/conda/environments/cuml_dev_cuda11.4.yml
@@ -6,23 +6,23 @@ channels:
 - conda-forge
 dependencies:
 - cudatoolkit=11.4
-- rapids-build-env=21.10.*
-- rapids-notebook-env=21.10.*
-- rapids-doc-env=21.10.*
-- cudf=21.10.*
-- rmm=21.10.*
-- libcumlprims=21.10.*
-- dask-cudf=21.10.*
-- dask-cuda=21.10.*
-- ucx-py=0.22
+- rapids-build-env=21.12.*
+- rapids-notebook-env=21.12.*
+- rapids-doc-env=21.12.*
+- cudf=21.12.*
+- rmm=21.12.*
+- libcumlprims=21.12.*
+- dask-cudf=21.12.*
+- dask-cuda=21.12.*
+- ucx-py=0.23
 - ucx-proc=*=gpu
 - dask-ml
 - doxygen>=1.8.20
 - libfaiss>=1.7.0
 - faiss-proc=*=cuda
 - umap-learn
-- scikit-learn=0.23.1
-- treelite=2.0.0
+- scikit-learn=0.24
+- treelite=2.1.0
 - statsmodels
 - seaborn
 - hdbscan
diff --git a/conda/recipes/cuml/meta.yaml b/conda/recipes/cuml/meta.yaml
index 485d65d6ea..8788ff99f2 100644
--- a/conda/recipes/cuml/meta.yaml
+++ b/conda/recipes/cuml/meta.yaml
@@ -28,12 +28,12 @@ requirements:
     - setuptools
     - cython>=0.29,<0.30
     - cmake>=3.20.1
-    - treelite=2.0.0
+    - treelite=2.1.0
     - cudf {{ minor_version }}
     - libcuml={{ version }}
     - libcumlprims {{ minor_version }}
     - cudatoolkit {{ cuda_version }}.*
-    - ucx-py 0.22
+    - ucx-py 0.23
     - ucx-proc=*=gpu
   run:
     - python x.x
@@ -42,12 +42,12 @@ requirements:
     - libcuml={{ version }}
     - libcumlprims {{ minor_version }}
     - cupy>=7.8.0,<10.0.0a0
-    - treelite=2.0.0
+    - treelite=2.1.0
     - nccl>=2.9.9
-    - ucx-py 0.22
+    - ucx-py 0.23
     - ucx-proc=*=gpu
-    - dask>=2021.6.0
-    - distributed>=2021.6.0
+    - dask>=2021.09.1
+    - distributed>=2021.09.1
     - joblib >=0.11
     - {{ pin_compatible('cudatoolkit', max_pin='x.x') }}
 
diff --git a/conda/recipes/libcuml/meta.yaml b/conda/recipes/libcuml/meta.yaml
index 644d7f22ed..3a3588fadd 100644
--- a/conda/recipes/libcuml/meta.yaml
+++ b/conda/recipes/libcuml/meta.yaml
@@ -39,11 +39,11 @@ requirements:
     - nccl>=2.9.9
     - cudf {{ minor_version }}
     - cudatoolkit {{ cuda_version }}.*
-    - ucx-py 0.22
+    - ucx-py 0.23
     - ucx-proc=*=gpu
     - libcumlprims {{ minor_version }}
     - lapack
-    - treelite=2.0.0
+    - treelite=2.1.0
     - faiss-proc=*=cuda
     - gtest=1.10.0
     - gmock
@@ -52,10 +52,10 @@ requirements:
     - libcumlprims {{ minor_version }}
     - cudf {{ minor_version }}
     - nccl>=2.9.9
-    - ucx-py 0.22
+    - ucx-py 0.23
     - ucx-proc=*=gpu
     - {{ pin_compatible('cudatoolkit', max_pin='x.x') }}
-    - treelite=2.0.0
+    - treelite=2.1.0
     - faiss-proc=*=cuda
     - libfaiss 1.7.0 *_cuda
 
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 209cfd29db..332f23a43b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -26,7 +26,7 @@ include(rapids-find)
 
 rapids_cuda_init_architectures(CUML)
 
-project(CUML VERSION 21.10.00 LANGUAGES CXX CUDA)
+project(CUML VERSION 21.12.00 LANGUAGES CXX CUDA)
 
 # Write the version header
 rapids_cmake_write_version_file(include/cuml/version_config.hpp)
@@ -57,12 +57,17 @@ option(CUDA_ENABLE_KERNEL_INFO "Enable kernel resource usage info" OFF)
 option(CUDA_ENABLE_LINE_INFO "Enable lineinfo in nvcc" OFF)
 option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON)
 option(DISABLE_DEPRECATION_WARNINGS "Disable depreaction warnings " ON)
+option(DISABLE_FORCE_CLONE_RAFT "By default, CPM will clone RAFT even if it's already in the environment. Set to disable that behavior." OFF)
 option(DISABLE_OPENMP "Disable OpenMP" OFF)
 option(ENABLE_CUMLPRIMS_MG "Enable algorithms that use libcumlprims_mg" ON)
 option(NVTX "Enable nvtx markers" OFF)
 option(SINGLEGPU "Disable all mnmg components and comms libraries" OFF)
 option(USE_CCACHE "Cache build artifacts with ccache" OFF)
 
+set(CUML_CPP_ALGORITHMS "ALL" CACHE STRING "Experimental: Choose which algorithms are built into libcuml++.so. Only 'FIL' and 'ALL' are supported right now.")
+set_property(CACHE CUML_CPP_ALGORITHMS PROPERTY STRINGS "ALL" "FIL")
+message(VERBOSE "CUML: Building libcuml++ with algoriths: '${CUML_CPP_ALGORITHMS}'.")
+
 message(VERBOSE "CUML: Building libcuml_c shared library. Contains the cuML C API: ${BUILD_CUML_C_LIBRARY}")
 message(VERBOSE "CUML: Building libcuml shared library: ${BUILD_CUML_CPP_LIBRARY}")
 message(VERBOSE "CUML: Building cuML algorithm tests: ${BUILD_CUML_TESTS}")
@@ -147,6 +152,15 @@ if(NOT BUILD_CUML_CPP_LIBRARY)
   set(BUILD_CUML_EXAMPLES OFF)
 endif()
 
+if(CUML_CPP_ALGORITHMS STREQUAL "ALL")
+  set(LINK_FAISS ON)
+elseif(CUML_CPP_ALGORITHMS STREQUAL "FIL")
+  set(SINGLEGPU ON)
+  set(BUILD_CUML_C_LIBRARY OFF)
+  set(BUILD_CUML_EXAMPLES OFF)
+  set(LINK_FAISS OFF)
+endif()
+
 # SingleGPU build disables cumlprims_mg and comms components
 if(SINGLEGPU)
   message(STATUS "Detected SINGLEGPU build option")
@@ -177,7 +191,11 @@ rapids_cpm_init()
 
 include(cmake/thirdparty/get_thrust.cmake)
 include(cmake/thirdparty/get_rmm.cmake)
-include(cmake/thirdparty/get_faiss.cmake)
+
+if(LINK_FAISS)
+  include(cmake/thirdparty/get_faiss.cmake)
+endif()
+
 include(cmake/thirdparty/get_treelite.cmake)
 include(cmake/thirdparty/get_raft.cmake)
 
@@ -215,64 +233,84 @@ endif()
 if(BUILD_CUML_CPP_LIBRARY)
 
   # single GPU components
+
+  # common components
   add_library(${CUML_CPP_TARGET} SHARED
-    src/arima/batched_arima.cu
-    src/arima/batched_kalman.cu
-    src/common/logger.cpp
-    src/common/nvtx.cu
-    src/datasets/make_arima.cu
-    src/datasets/make_blobs.cu
-    src/datasets/make_regression.cu
-    src/dbscan/dbscan.cu
-    src/decisiontree/decisiontree.cu
-    src/explainer/kernel_shap.cu
-    src/explainer/permutation_shap.cu
-    src/fil/fil.cu
-    src/fil/infer.cu
-    src/glm/glm.cu
-    src/genetic/genetic.cu
-    src/genetic/node.cu
-    src/hdbscan/hdbscan.cu
-    src/hdbscan/condensed_hierarchy.cu
-    src/holtwinters/holtwinters.cu
-    src/kmeans/kmeans.cu
-    src/knn/knn.cu
-    src/knn/knn_sparse.cu
-    src/hierarchy/linkage.cu
-    src/metrics/accuracy_score.cu
-    src/metrics/adjusted_rand_index.cu
-    src/metrics/completeness_score.cu
-    src/metrics/entropy.cu
-    src/metrics/homogeneity_score.cu
-    src/metrics/kl_divergence.cu
-    src/metrics/mutual_info_score.cu
-    src/metrics/pairwise_distance.cu
-    src/metrics/pairwise_distance_canberra.cu
-    src/metrics/pairwise_distance_chebyshev.cu
-    src/metrics/pairwise_distance_cosine.cu
-    src/metrics/pairwise_distance_euclidean.cu
-    src/metrics/pairwise_distance_hellinger.cu
-    src/metrics/pairwise_distance_l1.cu
-    src/metrics/pairwise_distance_minkowski.cu
-    src/metrics/r2_score.cu
-    src/metrics/rand_index.cu
-    src/metrics/silhouette_score.cu
-    src/metrics/trustworthiness.cu
-    src/metrics/v_measure.cu
-    src/pca/pca.cu
-    src/randomforest/randomforest.cu
-    src/random_projection/rproj.cu
-    src/solver/lars.cu
-    src/solver/solver.cu
-    src/spectral/spectral.cu
-    src/svm/svc.cu
-    src/svm/svr.cu
-    src/svm/ws_util.cu
-    src/tsa/auto_arima.cu
-    src/tsa/stationarity.cu
-    src/tsne/tsne.cu
-    src/tsvd/tsvd.cu
-    src/umap/umap.cu)
+              src/common/logger.cpp
+              src/common/nvtx.cu)
+
+
+  # FIL components
+  target_sources(${CUML_CPP_TARGET}
+    PRIVATE
+      src/fil/fil.cu
+      src/fil/infer.cu)
+
+  # Rest of the algorithms
+  if(CUML_CPP_ALGORITHMS STREQUAL "ALL")
+    target_sources(${CUML_CPP_TARGET}
+      PRIVATE
+        src/arima/batched_arima.cu
+        src/arima/batched_kalman.cu
+        src/datasets/make_arima.cu
+        src/datasets/make_blobs.cu
+        src/datasets/make_regression.cu
+        src/dbscan/dbscan.cu
+        src/decisiontree/decisiontree.cu
+        src/explainer/kernel_shap.cu
+        src/explainer/permutation_shap.cu
+        src/fil/fil.cu
+        src/fil/infer.cu
+        src/glm/glm.cu
+        src/genetic/genetic.cu
+        src/genetic/node.cu
+        src/hdbscan/hdbscan.cu
+        src/hdbscan/condensed_hierarchy.cu
+        src/holtwinters/holtwinters.cu
+        src/kmeans/kmeans.cu
+        src/knn/knn.cu
+        src/knn/knn_sparse.cu
+        src/hierarchy/linkage.cu
+        src/metrics/accuracy_score.cu
+        src/metrics/adjusted_rand_index.cu
+        src/metrics/completeness_score.cu
+        src/metrics/entropy.cu
+        src/metrics/homogeneity_score.cu
+        src/metrics/kl_divergence.cu
+        src/metrics/mutual_info_score.cu
+        src/metrics/pairwise_distance.cu
+        src/metrics/pairwise_distance_canberra.cu
+        src/metrics/pairwise_distance_chebyshev.cu
+        src/metrics/pairwise_distance_correlation.cu
+        src/metrics/pairwise_distance_cosine.cu
+        src/metrics/pairwise_distance_euclidean.cu
+        src/metrics/pairwise_distance_hamming.cu
+        src/metrics/pairwise_distance_hellinger.cu
+        src/metrics/pairwise_distance_jensen_shannon.cu
+        src/metrics/pairwise_distance_kl_divergence.cu
+        src/metrics/pairwise_distance_l1.cu
+        src/metrics/pairwise_distance_minkowski.cu
+        src/metrics/pairwise_distance_russell_rao.cu
+        src/metrics/r2_score.cu
+        src/metrics/rand_index.cu
+        src/metrics/silhouette_score.cu
+        src/metrics/trustworthiness.cu
+        src/metrics/v_measure.cu
+        src/pca/pca.cu
+        src/randomforest/randomforest.cu
+        src/random_projection/rproj.cu
+        src/solver/lars.cu
+        src/solver/solver.cu
+        src/spectral/spectral.cu
+        src/svm/svc.cu
+        src/svm/svr.cu
+        src/svm/ws_util.cu
+        src/tsa/auto_arima.cu
+        src/tsa/stationarity.cu
+        src/tsne/tsne.cu
+        src/tsvd/tsvd.cu
+        src/umap/umap.cu)
+  endif()
 
   # multi GPU components
   # todo: separate mnmg that require cumlprims from those that don't
@@ -346,7 +384,7 @@ if(BUILD_CUML_CPP_LIBRARY)
       CUDA::cudart
       CUDA::cusparse
       $<$<BOOL:${NVTX}>:CUDA::nvToolsExt>
-      FAISS::FAISS
+      $<$<BOOL:${LINK_FAISS}>:FAISS::FAISS>
       $<IF:$<BOOL:${Treelite_ADDED}>,treelite::treelite_static,treelite::treelite>
       $<IF:$<BOOL:${Treelite_ADDED}>,treelite::treelite_runtime_static,treelite::treelite_runtime>
       $<$<BOOL:${OpenMP_FOUND}>:OpenMP::OpenMP_CXX>
@@ -451,6 +489,11 @@ Provide targets for cuML.
 cuML is a suite of libraries that implement machine learning algorithms and mathematical primitives
 functions that share compatible APIs with other RAPIDS projects.
 
+]=])
+
+set(code_string
+[=[
+thrust_create_target(cuml::Thrust FROM_OPTIONS)
 ]=])
 
  rapids_export(INSTALL cuml
@@ -458,11 +501,12 @@ functions that share compatible APIs with other RAPIDS projects.
     GLOBAL_TARGETS cuml
     NAMESPACE cuml::
     DOCUMENTATION doc_string
+    FINAL_CODE_BLOCK code_string
     )
 
 ################################################################################################
 # - build export -------------------------------------------------------------------------------
-set(code_string [=[thrust_create_target(cuml::Thrust FROM_OPTIONS)]=])
+
 rapids_export(BUILD cuml
     EXPORT_SET cuml-exports
     GLOBAL_TARGETS cuml
diff --git a/cpp/bench/prims/distance_common.cuh b/cpp/bench/prims/distance_common.cuh
index cc4eff27db..6bd8640a50 100644
--- a/cpp/bench/prims/distance_common.cuh
+++ b/cpp/bench/prims/distance_common.cuh
@@ -16,7 +16,7 @@
 
 #include <raft/cudart_utils.h>
 #include <common/ml_benchmark.hpp>
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 
 namespace MLCommon {
 namespace Bench {
diff --git a/cpp/bench/prims/fused_l2_nn.cu b/cpp/bench/prims/fused_l2_nn.cu
index ef21a03881..bd011cb145 100644
--- a/cpp/bench/prims/fused_l2_nn.cu
+++ b/cpp/bench/prims/fused_l2_nn.cu
@@ -17,7 +17,8 @@
 #include <raft/cudart_utils.h>
 #include <common/ml_benchmark.hpp>
 #include <limits>
-#include <raft/distance/fused_l2_nn.cuh>
+#include <raft/distance/fused_l2_nn.hpp>
+#include <raft/handle.hpp>
 #include <raft/linalg/norm.cuh>
 #include <raft/random/rng.cuh>
 
@@ -43,13 +44,15 @@ struct FusedL2NN : public Fixture {
     alloc(out, params.m);
     alloc(workspace, params.m);
     raft::random::Rng r(123456ULL);
+    raft::handle_t handle;
+    handle.set_stream(stream);
+
     r.uniform(x, params.m * params.k, T(-1.0), T(1.0), stream);
     r.uniform(y, params.n * params.k, T(-1.0), T(1.0), stream);
     raft::linalg::rowNorm(xn, x, params.k, params.m, raft::linalg::L2Norm, true, stream);
     raft::linalg::rowNorm(yn, y, params.k, params.n, raft::linalg::L2Norm, true, stream);
-    auto blks = raft::ceildiv(params.m, 256);
-    raft::distance::initKernel<T, cub::KeyValuePair<int, T>, int>
-      <<<blks, 256, 0, stream>>>(out, params.m, std::numeric_limits<T>::max(), op);
+    raft::distance::initialize<T, cub::KeyValuePair<int, T>, int>(
+      handle, out, params.m, std::numeric_limits<T>::max(), op);
   }
 
   void deallocateBuffers(const ::benchmark::State& state) override
diff --git a/cpp/bench/sg/arima_loglikelihood.cu b/cpp/bench/sg/arima_loglikelihood.cu
index 2f7cce35eb..3cd4855f0b 100644
--- a/cpp/bench/sg/arima_loglikelihood.cu
+++ b/cpp/bench/sg/arima_loglikelihood.cu
@@ -44,7 +44,6 @@ class ArimaLoglikelihood : public TsFixtureRandom<DataT> {
       order(p.order),
       param(0, rmm::cuda_stream_default),
       loglike(0, rmm::cuda_stream_default),
-      residual(0, rmm::cuda_stream_default),
       temp_mem(0, rmm::cuda_stream_default)
   {
   }
@@ -85,7 +84,6 @@ class ArimaLoglikelihood : public TsFixtureRandom<DataT> {
                       order,
                       param.data(),
                       loglike.data(),
-                      residual.data(),
                       true,
                       false);
     });
@@ -101,9 +99,8 @@ class ArimaLoglikelihood : public TsFixtureRandom<DataT> {
     // Buffer for the model parameters
     param.resize(order.complexity() * this->params.batch_size, stream);
 
-    // Buffers for the log-likelihood and residuals
+    // Buffers for the log-likelihood
     loglike.resize(this->params.batch_size, stream);
-    residual.resize(this->params.batch_size * this->params.n_obs, stream);
 
     // Temporary memory
     size_t temp_buf_size =
@@ -117,7 +114,6 @@ class ArimaLoglikelihood : public TsFixtureRandom<DataT> {
   ARIMAOrder order;
   rmm::device_uvector<DataT> param;
   rmm::device_uvector<DataT> loglike;
-  rmm::device_uvector<DataT> residual;
   rmm::device_uvector<char> temp_mem;
 };
 
diff --git a/cpp/bench/sg/fil.cu b/cpp/bench/sg/fil.cu
index 2b92aa6364..d9bbd6c873 100644
--- a/cpp/bench/sg/fil.cu
+++ b/cpp/bench/sg/fil.cu
@@ -81,7 +81,7 @@ class FIL : public RegressionFixture<float> {
     fit(*handle, mPtr, data.X.data(), train_nrows, params.ncols, data.y.data(), p_rest.rf);
     CUDA_CHECK(cudaStreamSynchronize(stream));
 
-    ML::build_treelite_forest(&model, &rf_model, params.ncols, params.nclasses > 1 ? 2 : 1);
+    ML::build_treelite_forest(&model, &rf_model, params.ncols);
     ML::fil::treelite_params_t tl_params = {
       .algo              = p_rest.algo,
       .output_class      = params.nclasses > 1,    // cuML RF forest
diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
index abef8830d4..b0a053b582 100644
--- a/cpp/cmake/thirdparty/get_raft.cmake
+++ b/cpp/cmake/thirdparty/get_raft.cmake
@@ -20,6 +20,11 @@ function(find_and_configure_raft)
     cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
                           "${multiValueArgs}" ${ARGN} )
 
+    if(DEFINED CPM_raft_SOURCE OR NOT DISABLE_FORCE_CLONE_RAFT)
+      set(CPM_DL_ALL_CACHE ${CPM_DOWNLOAD_ALL})
+      set(CPM_DOWNLOAD_ALL ON)
+    endif()
+
     rapids_cpm_find(raft ${PKG_VERSION}
       GLOBAL_TARGETS      raft::raft
       BUILD_EXPORT_SET    cuml-exports
@@ -32,7 +37,15 @@ function(find_and_configure_raft)
               "BUILD_TESTS OFF"
     )
 
-    message(VERBOSE "CUML: Using RAFT located in ${raft_SOURCE_DIR}")
+    if(raft_ADDED)
+      message(VERBOSE "CUML: Using RAFT located in ${raft_SOURCE_DIR}")
+    else()
+      message(VERBOSE "CUML: Using RAFT located in ${raft_DIR}")
+    endif()
+
+    if(DEFINED CPM_raft_SOURCE OR NOT DISABLE_FORCE_CLONE_RAFT)
+      set(CPM_DOWNLOAD_ALL ${CPM_DL_ALL_CACHE})
+    endif()
 
 endfunction()
 
@@ -45,4 +58,4 @@ set(CUML_BRANCH_VERSION_raft "${CUML_VERSION_MAJOR}.${CUML_VERSION_MINOR}")
 find_and_configure_raft(VERSION    ${CUML_MIN_VERSION_raft}
                         FORK       rapidsai
                         PINNED_TAG branch-${CUML_BRANCH_VERSION_raft}
-                        )
+                        )
\ No newline at end of file
diff --git a/cpp/cmake/thirdparty/get_treelite.cmake b/cpp/cmake/thirdparty/get_treelite.cmake
index adf7958c7f..171706ea20 100644
--- a/cpp/cmake/thirdparty/get_treelite.cmake
+++ b/cpp/cmake/thirdparty/get_treelite.cmake
@@ -54,5 +54,5 @@ function(find_and_configure_treelite)
 
 endfunction()
 
-find_and_configure_treelite(VERSION     2.0.0
-                        PINNED_TAG  b117da58d7d9a5cc54aa3711e5ad9a8407734c6e)
+find_and_configure_treelite(VERSION     2.1.0
+                        PINNED_TAG  e5248931c62e3807248e0b150e27b2530a510634)
diff --git a/cpp/include/cuml/common/pinned_host_vector.hpp b/cpp/include/cuml/common/pinned_host_vector.hpp
new file mode 100644
index 0000000000..b29527a893
--- /dev/null
+++ b/cpp/include/cuml/common/pinned_host_vector.hpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <rmm/mr/host/pinned_memory_resource.hpp>
+
+namespace ML {
+
+template <typename T>
+class pinned_host_vector {
+ public:
+  pinned_host_vector() = default;
+
+  explicit pinned_host_vector(std::size_t n)
+    : size_{n}, data_{static_cast<T*>(pinned_mr.allocate(n * sizeof(T)))}
+  {
+    std::uninitialized_fill(data_, data_ + n, static_cast<T>(0));
+  }
+  ~pinned_host_vector() { pinned_mr.deallocate(data_, size_ * sizeof(T)); }
+
+  pinned_host_vector(pinned_host_vector const&) = delete;
+  pinned_host_vector(pinned_host_vector&&)      = delete;
+  pinned_host_vector& operator=(pinned_host_vector const&) = delete;
+  pinned_host_vector& operator=(pinned_host_vector&&) = delete;
+
+  void resize(std::size_t n)
+  {
+    size_ = n;
+    data_ = static_cast<T*>(pinned_mr.allocate(n * sizeof(T)));
+    std::uninitialized_fill(data_, data_ + n, static_cast<T>(0));
+  }
+
+  T* data() { return data_; }
+
+  T* begin() { return data_; }
+
+  T* end() { return data_ + size_; }
+
+  std::size_t size() { return size_; }
+
+  T operator[](std::size_t idx) const { return *(data_ + idx); }
+  T& operator[](std::size_t idx) { return *(data_ + idx); }
+
+ private:
+  rmm::mr::pinned_memory_resource pinned_mr{};
+  T* data_;
+  std::size_t size_;
+};
+
+}  // namespace ML
\ No newline at end of file
diff --git a/cpp/include/cuml/decomposition/params.hpp b/cpp/include/cuml/decomposition/params.hpp
index 4c81d50abb..0a86db5d71 100644
--- a/cpp/include/cuml/decomposition/params.hpp
+++ b/cpp/include/cuml/decomposition/params.hpp
@@ -38,8 +38,6 @@ class params {
 
 class paramsSolver : public params {
  public:
-  int n_rows;
-  int n_cols;
   // math_t tol = 0.0;
   float tol        = 0.0;
   int n_iterations = 15;
diff --git a/cpp/include/cuml/ensemble/randomforest.hpp b/cpp/include/cuml/ensemble/randomforest.hpp
index 2d530f2314..20d9360e69 100644
--- a/cpp/include/cuml/ensemble/randomforest.hpp
+++ b/cpp/include/cuml/ensemble/randomforest.hpp
@@ -130,8 +130,7 @@ std::string get_rf_json(const RandomForestMetaData<T, L>* forest);
 template <class T, class L>
 void build_treelite_forest(ModelHandle* model,
                            const RandomForestMetaData<T, L>* forest,
-                           int num_features,
-                           int task_category);
+                           int num_features);
 
 ModelHandle concatenate_trees(std::vector<ModelHandle> treelite_handles);
 
diff --git a/cpp/include/cuml/fil/fil.h b/cpp/include/cuml/fil/fil.h
index 7b35e3d789..ad18b4ed1d 100644
--- a/cpp/include/cuml/fil/fil.h
+++ b/cpp/include/cuml/fil/fil.h
@@ -72,6 +72,9 @@ struct forest;
 /** forest_t is the predictor handle */
 typedef forest* forest_t;
 
+/** MAX_N_ITEMS determines the maximum allowed value for tl_params::n_items */
+constexpr int MAX_N_ITEMS = 4;
+
 /** treelite_params_t are parameters for importing treelite models */
 struct treelite_params_t {
   // algo is the inference algorithm
@@ -94,7 +97,7 @@ struct treelite_params_t {
   // can only be a power of 2
   int threads_per_tree;
   // n_items is how many input samples (items) any thread processes. If 0 is given,
-  // choose most (up to 4) that fit into shared memory.
+  // choose most (up to MAX_N_ITEMS) that fit into shared memory.
   int n_items;
   // if non-nullptr, *pforest_shape_str will be set to caller-owned string that
   // contains forest shape
diff --git a/cpp/include/cuml/manifold/umap.hpp b/cpp/include/cuml/manifold/umap.hpp
index fded4eba29..a5b32537ad 100644
--- a/cpp/include/cuml/manifold/umap.hpp
+++ b/cpp/include/cuml/manifold/umap.hpp
@@ -18,10 +18,15 @@
 
 #include <cstddef>
 #include <cstdint>
+#include <memory>
 
 namespace raft {
 class handle_t;
-}
+namespace sparse {
+template <typename T, typename Index_Type>
+class COO;
+};
+}  // namespace raft
 
 namespace ML {
 class UMAPParams;
@@ -69,6 +74,46 @@ void fit(const raft::handle_t& handle,
          UMAPParams* params,
          float* embeddings);
 
+/**
+ * refine performs a UMAP fit on existing embeddings without reinitializing them, which enables
+ * iterative fitting without callbacks.
+ *
+ * @param handle: raft::handle_t
+ * @param X: pointer to input array
+ * @param n: n_samples of input array
+ * @param d: n_features of input array
+ * @param cgraph_coo: pointer to raft::sparse::COO object computed using ML::UMAP::get_graph
+ * @param params: pointer to ML::UMAPParams object
+ * @param embeddings: pointer to current embedding with shape n * n_components, stores updated
+ * embeddings on executing refine
+ */
+void refine(const raft::handle_t& handle,
+            float* X,  // input matrix
+            int n,
+            int d,
+            raft::sparse::COO<float, int>* cgraph_coo,
+            UMAPParams* params,
+            float* embeddings);
+
+/**
+ * returns a simplical set as a raft::sparse:COO object to be consumed by the ML::UMAP::refine
+ * function.
+ *
+ * @param handle: raft::handle_t
+ * @param X: pointer to input array
+ * @param y: pointer to labels array
+ * @param n: n_samples of input array
+ * @param d: n_features of input array
+ * @param params: pointer to ML::UMAPParams object
+ * @return: simplical set (pointer to raft::sparse::COO object)
+ */
+std::unique_ptr<raft::sparse::COO<float, int>> get_graph(const raft::handle_t& handle,
+                                                         float* X,  // input matrix
+                                                         float* y,  // labels
+                                                         int n,
+                                                         int d,
+                                                         UMAPParams* params);
+
 void fit_sparse(const raft::handle_t& handle,
                 int* indptr,  // input matrix
                 int* indices,
diff --git a/cpp/include/cuml/neighbors/knn.hpp b/cpp/include/cuml/neighbors/knn.hpp
index b236aff698..08f726c6af 100644
--- a/cpp/include/cuml/neighbors/knn.hpp
+++ b/cpp/include/cuml/neighbors/knn.hpp
@@ -18,6 +18,7 @@
 
 #include <raft/linalg/distance_type.h>
 #include <raft/spatial/knn/ann_common.h>
+#include <raft/spatial/knn/ball_cover_common.h>
 
 namespace raft {
 class handle_t;
@@ -60,6 +61,16 @@ void brute_force_knn(const raft::handle_t& handle,
                      raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded,
                      float metric_arg                    = 2.0f);
 
+void rbc_build_index(const raft::handle_t& handle,
+                     raft::spatial::knn::BallCoverIndex<int64_t, float, uint32_t>& index);
+
+void rbc_knn_query(const raft::handle_t& handle,
+                   raft::spatial::knn::BallCoverIndex<int64_t, float, uint32_t>& index,
+                   uint32_t k,
+                   const float* search_items,
+                   uint32_t n_search_items,
+                   int64_t* out_inds,
+                   float* out_dists);
 /**
  * @brief Flat C++ API function to build an approximate nearest neighbors index
  * from an index array and a set of parameters.
diff --git a/cpp/include/cuml/tree/algo_helper.h b/cpp/include/cuml/tree/algo_helper.h
index 28b4ac0e5d..483f936118 100644
--- a/cpp/include/cuml/tree/algo_helper.h
+++ b/cpp/include/cuml/tree/algo_helper.h
@@ -22,6 +22,9 @@ enum CRITERION {
   ENTROPY,
   MSE,
   MAE,
+  POISSON,
+  GAMMA,
+  INVERSE_GAUSSIAN,
   CRITERION_END,
 };
 
diff --git a/cpp/include/cuml/tree/decisiontree.hpp b/cpp/include/cuml/tree/decisiontree.hpp
index ff8b0dc5f0..803a1f1c1e 100644
--- a/cpp/include/cuml/tree/decisiontree.hpp
+++ b/cpp/include/cuml/tree/decisiontree.hpp
@@ -103,7 +103,9 @@ struct TreeMetaDataNode {
   int depth_counter;
   int leaf_counter;
   double train_time;
+  std::vector<T> vector_leaf;
   std::vector<SparseTreeNode<T, L>> sparsetree;
+  int num_outputs;
 };
 
 /**
diff --git a/cpp/include/cuml/tree/flatnode.h b/cpp/include/cuml/tree/flatnode.h
index bae180515d..57c0deaefb 100644
--- a/cpp/include/cuml/tree/flatnode.h
+++ b/cpp/include/cuml/tree/flatnode.h
@@ -33,20 +33,14 @@
 template <typename DataT, typename LabelT, typename IdxT = int>
 struct SparseTreeNode {
  private:
-  LabelT prediction     = LabelT(0);
   IdxT colid            = 0;
   DataT quesval         = DataT(0);
   DataT best_metric_val = DataT(0);
   IdxT left_child_id    = -1;
   IdxT instance_count   = 0;
-  FLATNODE_HD SparseTreeNode(LabelT prediction,
-                             IdxT colid,
-                             DataT quesval,
-                             DataT best_metric_val,
-                             int64_t left_child_id,
-                             IdxT instance_count)
-    : prediction(prediction),
-      colid(colid),
+  FLATNODE_HD SparseTreeNode(
+    IdxT colid, DataT quesval, DataT best_metric_val, int64_t left_child_id, IdxT instance_count)
+    : colid(colid),
       quesval(quesval),
       best_metric_val(best_metric_val),
       left_child_id(left_child_id),
@@ -55,7 +49,6 @@ struct SparseTreeNode {
   }
 
  public:
-  FLATNODE_HD LabelT Prediction() const { return prediction; }
   FLATNODE_HD IdxT ColumnId() const { return colid; }
   FLATNODE_HD DataT QueryValue() const { return quesval; }
   FLATNODE_HD DataT BestMetric() const { return best_metric_val; }
@@ -67,17 +60,17 @@ struct SparseTreeNode {
     IdxT colid, DataT quesval, DataT best_metric_val, int64_t left_child_id, IdxT instance_count)
   {
     return SparseTreeNode<DataT, LabelT>{
-      LabelT(0), colid, quesval, best_metric_val, left_child_id, instance_count};
+      colid, quesval, best_metric_val, left_child_id, instance_count};
   }
-  FLATNODE_HD static SparseTreeNode CreateLeafNode(LabelT prediction, IdxT instance_count)
+  FLATNODE_HD static SparseTreeNode CreateLeafNode(IdxT instance_count)
   {
-    return SparseTreeNode<DataT, LabelT>{prediction, 0, 0, 0, -1, instance_count};
+    return SparseTreeNode<DataT, LabelT>{0, 0, 0, -1, instance_count};
   }
   FLATNODE_HD bool IsLeaf() const { return left_child_id == -1; }
   bool operator==(const SparseTreeNode& other) const
   {
-    return (this->prediction == other.prediction) && (this->colid == other.colid) &&
-           (this->quesval == other.quesval) && (this->best_metric_val == other.best_metric_val) &&
+    return (this->colid == other.colid) && (this->quesval == other.quesval) &&
+           (this->best_metric_val == other.best_metric_val) &&
            (this->left_child_id == other.left_child_id) &&
            (this->instance_count == other.instance_count);
   }
diff --git a/cpp/include/cuml/tsa/arima_common.h b/cpp/include/cuml/tsa/arima_common.h
index 67c4874328..2ed9da31e2 100644
--- a/cpp/include/cuml/tsa/arima_common.h
+++ b/cpp/include/cuml/tsa/arima_common.h
@@ -200,9 +200,9 @@ struct ARIMAMemory {
   T *params_mu, *params_ar, *params_ma, *params_sar, *params_sma, *params_sigma2, *Tparams_mu,
     *Tparams_ar, *Tparams_ma, *Tparams_sar, *Tparams_sma, *Tparams_sigma2, *d_params, *d_Tparams,
     *Z_dense, *R_dense, *T_dense, *RQR_dense, *RQ_dense, *P_dense, *alpha_dense, *ImT_dense,
-    *ImT_inv_dense, *v_tmp_dense, *m_tmp_dense, *K_dense, *TP_dense, *vs, *y_diff, *loglike,
-    *loglike_base, *loglike_pert, *x_pert, *F_buffer, *sumLogF_buffer, *sigma2_buffer,
-    *I_m_AxA_dense, *I_m_AxA_inv_dense, *Ts_dense, *RQRs_dense, *Ps_dense;
+    *ImT_inv_dense, *v_tmp_dense, *m_tmp_dense, *K_dense, *TP_dense, *pred, *y_diff, *loglike,
+    *loglike_base, *loglike_pert, *x_pert, *I_m_AxA_dense, *I_m_AxA_inv_dense, *Ts_dense,
+    *RQRs_dense, *Ps_dense;
   T **Z_batches, **R_batches, **T_batches, **RQR_batches, **RQ_batches, **P_batches,
     **alpha_batches, **ImT_batches, **ImT_inv_batches, **v_tmp_batches, **m_tmp_batches,
     **K_batches, **TP_batches, **I_m_AxA_batches, **I_m_AxA_inv_batches, **Ts_batches,
@@ -279,11 +279,8 @@ struct ARIMAMemory {
     append_buffer<assign>(K_batches, batch_size);
     append_buffer<assign>(TP_dense, rd * rd * batch_size);
     append_buffer<assign>(TP_batches, batch_size);
-    append_buffer<assign>(F_buffer, n_obs * batch_size);
-    append_buffer<assign>(sumLogF_buffer, batch_size);
-    append_buffer<assign>(sigma2_buffer, batch_size);
 
-    append_buffer<assign>(vs, n_obs * batch_size);
+    append_buffer<assign>(pred, n_obs * batch_size);
     append_buffer<assign>(y_diff, n_obs * batch_size);
     append_buffer<assign>(loglike, batch_size);
     append_buffer<assign>(loglike_base, batch_size);
diff --git a/cpp/include/cuml/tsa/batched_arima.hpp b/cpp/include/cuml/tsa/batched_arima.hpp
index aa8059eb32..ad3c78a05c 100644
--- a/cpp/include/cuml/tsa/batched_arima.hpp
+++ b/cpp/include/cuml/tsa/batched_arima.hpp
@@ -56,6 +56,15 @@ void unpack(raft::handle_t& handle,
             int batch_size,
             const double* param_vec);
 
+/**
+ * Detect missing observations in a time series
+ *
+ * @param[in]  handle     cuML handle
+ * @param[in]  d_y        Time series
+ * @param[in]  n_elem     Total number of elements in the dataset
+ */
+bool detect_missing(raft::handle_t& handle, const double* d_y, int n_elem);
+
 /**
  * Compute the differenced series (seasonal and/or non-seasonal differences)
  *
@@ -87,9 +96,6 @@ void batched_diff(raft::handle_t& handle,
  * @param[in]  d_params     Parameters to evaluate grouped by series:
  *                          [mu0, ar.., ma.., mu1, ..] (device)
  * @param[out] loglike      Log-Likelihood of the model per series
- * @param[out] d_vs         The residual between model and original signal.
- *                          shape = (n_obs-d-s*D, batch_size) (device)
- *                          Note: no output when using CSS estimation
  * @param[in]  trans        Run `jones_transform` on params.
  * @param[in]  host_loglike Whether loglike is a host pointer
  * @param[in]  method       Whether to use sum-of-squares or Kalman filter
@@ -110,7 +116,6 @@ void batched_loglike(raft::handle_t& handle,
                      const ARIMAOrder& order,
                      const double* d_params,
                      double* loglike,
-                     double* d_vs,
                      bool trans           = true,
                      bool host_loglike    = true,
                      LoglikeMethod method = MLE,
@@ -137,9 +142,6 @@ void batched_loglike(raft::handle_t& handle,
  * @param[in]  order        ARIMA hyper-parameters
  * @param[in]  params       ARIMA parameters (device)
  * @param[out] loglike      Log-Likelihood of the model per series
- * @param[out] d_vs         The residual between model and original signal.
- *                          shape = (n_obs-d-s*D, batch_size) (device)
- *                          Note: no output when using CSS estimation
  * @param[in]  trans        Run `jones_transform` on params.
  * @param[in]  host_loglike Whether loglike is a host pointer
  * @param[in]  method       Whether to use sum-of-squares or Kalman filter
@@ -160,7 +162,6 @@ void batched_loglike(raft::handle_t& handle,
                      const ARIMAOrder& order,
                      const ARIMAParams<double>& params,
                      double* loglike,
-                     double* d_vs,
                      bool trans           = true,
                      bool host_loglike    = true,
                      LoglikeMethod method = MLE,
@@ -277,12 +278,14 @@ void information_criterion(raft::handle_t& handle,
  * @param[in]  n_obs       Number of samples per time series
  *                         (all series must be identical)
  * @param[in]  order       ARIMA hyper-parameters
+ * @param[in]  missing     Are there missing observations?
  */
 void estimate_x0(raft::handle_t& handle,
                  ARIMAParams<double>& params,
                  const double* d_y,
                  int batch_size,
                  int n_obs,
-                 const ARIMAOrder& order);
+                 const ARIMAOrder& order,
+                 bool missing);
 
 }  // namespace ML
diff --git a/cpp/include/cuml/tsa/batched_kalman.hpp b/cpp/include/cuml/tsa/batched_kalman.hpp
index 3b41c0a811..4388dddee5 100644
--- a/cpp/include/cuml/tsa/batched_kalman.hpp
+++ b/cpp/include/cuml/tsa/batched_kalman.hpp
@@ -37,8 +37,7 @@ namespace ML {
  * @param[in]  order         ARIMA hyper-parameters
  * @param[in]  batch_size    Number of series making up the batch
  * @param[out] d_loglike     Resulting log-likelihood (per series) (device)
- * @param[out] d_vs          Residual between the prediction and the
- *                           original series.
+ * @param[out] d_pred        Predictions
  *                           shape=(nobs-d-s*D, batch_size) (device)
  * @param[in]  fc_steps      Number of steps to forecast
  * @param[in]  d_fc          Array to store the forecast
@@ -55,7 +54,7 @@ void batched_kalman_filter(raft::handle_t& handle,
                            const ARIMAOrder& order,
                            int batch_size,
                            double* d_loglike,
-                           double* d_vs,
+                           double* d_pred,
                            int fc_steps    = 0,
                            double* d_fc    = nullptr,
                            double level    = 0,
diff --git a/cpp/src/arima/batched_arima.cu b/cpp/src/arima/batched_arima.cu
index 9ebdd577c4..a73891db6d 100644
--- a/cpp/src/arima/batched_arima.cu
+++ b/cpp/src/arima/batched_arima.cu
@@ -23,6 +23,7 @@
 #include <thrust/fill.h>
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/logical.h>
 
 #include <cuml/tsa/batched_arima.hpp>
 #include <cuml/tsa/batched_kalman.hpp>
@@ -36,6 +37,7 @@
 #include <raft/linalg/matrix_vector_op.cuh>
 #include <rmm/device_uvector.hpp>
 #include <timeSeries/arima_helpers.cuh>
+#include <timeSeries/fillna.cuh>
 
 namespace ML {
 
@@ -71,6 +73,20 @@ void batched_diff(raft::handle_t& handle,
     d_y_diff, d_y, batch_size, n_obs, order.d, order.D, order.s, stream);
 }
 
+template <typename T>
+struct is_missing {
+  typedef T argument_type;
+  typedef T result_type;
+
+  __thrust_exec_check_disable__ __device__ const T operator()(const T& x) const { return isnan(x); }
+};  // end is_missing
+
+bool detect_missing(raft::handle_t& handle, const double* d_y, int n_elem)
+{
+  return thrust::any_of(
+    thrust::cuda::par.on(handle.get_stream()), d_y, d_y + n_elem, is_missing<double>());
+}
+
 void predict(raft::handle_t& handle,
              const ARIMAMemory<double>& arima_mem,
              const double* d_y,
@@ -108,7 +124,7 @@ void predict(raft::handle_t& handle,
     d_y_kf   = d_y;
   }
 
-  double* d_vs = arima_mem.vs;
+  double* d_pred = arima_mem.pred;
 
   // Create temporary array for the forecasts
   int num_steps = std::max(end - n_obs, 0);
@@ -126,7 +142,6 @@ void predict(raft::handle_t& handle,
                   order_after_prep,
                   params,
                   loglike.data(),
-                  d_vs,
                   false,
                   true,
                   MLE,
@@ -144,13 +159,16 @@ void predict(raft::handle_t& handle,
   // In-sample prediction
   //
 
-  int res_offset = diff ? order.d + order.s * order.D : 0;
-  int p_start    = std::max(start, res_offset);
-  int p_end      = std::min(n_obs, end);
-
   // The prediction loop starts by filling undefined predictions with NaN,
   // then computes the predictions from the observations and residuals
   if (start < n_obs) {
+    int res_offset = diff ? order.d + order.s * order.D : 0;
+    int p_start    = std::max(start, res_offset);
+    int p_end      = std::min(n_obs, end);
+    int dD         = diff ? order.d + order.D : 0;
+    int period1    = order.d ? 1 : order.s;
+    int period2    = order.d == 2 ? 1 : order.s;
+
     thrust::for_each(
       thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
         d_y_p[0] = 0.0;
@@ -158,8 +176,16 @@ void predict(raft::handle_t& handle,
           d_y_p[bid * predict_ld + i] = nan("");
         }
         for (int i = p_start; i < p_end; i++) {
-          d_y_p[bid * predict_ld + i - start] =
-            d_y[bid * n_obs + i] - d_vs[bid * n_obs_kf + i - res_offset];
+          if (dD == 0) {
+            d_y_p[bid * predict_ld + i - start] = d_pred[bid * n_obs + i];
+          } else if (dD == 1) {
+            d_y_p[bid * predict_ld + i - start] =
+              d_y[bid * n_obs + i - period1] + d_pred[bid * n_obs_kf + i - res_offset];
+          } else {
+            d_y_p[bid * predict_ld + i - start] =
+              d_y[bid * n_obs + i - period1] + d_y[bid * n_obs + i - period2] -
+              d_y[bid * n_obs + i - period1 - period2] + d_pred[bid * n_obs_kf + i - res_offset];
+          }
         }
       });
   }
@@ -343,7 +369,6 @@ void batched_loglike(raft::handle_t& handle,
                      const ARIMAOrder& order,
                      const ARIMAParams<double>& params,
                      double* loglike,
-                     double* d_vs,
                      bool trans,
                      bool host_loglike,
                      LoglikeMethod method,
@@ -358,6 +383,8 @@ void batched_loglike(raft::handle_t& handle,
 
   auto stream = handle.get_stream();
 
+  double* d_pred = arima_mem.pred;
+
   ARIMAParams<double> Tparams = {arima_mem.Tparams_mu,
                                  arima_mem.Tparams_ar,
                                  arima_mem.Tparams_ma,
@@ -396,7 +423,7 @@ void batched_loglike(raft::handle_t& handle,
                           order,
                           batch_size,
                           d_loglike,
-                          d_vs,
+                          d_pred,
                           fc_steps,
                           d_fc,
                           level,
@@ -419,7 +446,6 @@ void batched_loglike(raft::handle_t& handle,
                      const ARIMAOrder& order,
                      const double* d_params,
                      double* loglike,
-                     double* d_vs,
                      bool trans,
                      bool host_loglike,
                      LoglikeMethod method,
@@ -452,7 +478,6 @@ void batched_loglike(raft::handle_t& handle,
                   order,
                   params,
                   loglike,
-                  d_vs,
                   trans,
                   host_loglike,
                   method,
@@ -488,7 +513,6 @@ void batched_loglike_grad(raft::handle_t& handle,
   double* d_x_pert = arima_mem.x_pert;
   raft::copy(d_x_pert, d_x, N * batch_size, stream);
 
-  double* d_vs      = arima_mem.vs;
   double* d_ll_base = arima_mem.loglike_base;
   double* d_ll_pert = arima_mem.loglike_pert;
 
@@ -501,7 +525,6 @@ void batched_loglike_grad(raft::handle_t& handle,
                   order,
                   d_x,
                   d_ll_base,
-                  d_vs,
                   trans,
                   false,
                   method,
@@ -523,7 +546,6 @@ void batched_loglike_grad(raft::handle_t& handle,
                     order,
                     d_x_pert,
                     d_ll_pert,
-                    d_vs,
                     trans,
                     false,
                     method,
@@ -558,11 +580,9 @@ void information_criterion(raft::handle_t& handle,
   ML::PUSH_RANGE(__func__);
   auto stream = handle.get_stream();
 
-  double* d_vs = arima_mem.vs;
-
   /* Compute log-likelihood in d_ic */
   batched_loglike(
-    handle, arima_mem, d_y, batch_size, n_obs, order, params, d_ic, d_vs, false, false, MLE);
+    handle, arima_mem, d_y, batch_size, n_obs, order, params, d_ic, false, false, MLE);
 
   /* Compute information criterion from log-likelihood and base term */
   MLCommon::Metrics::Batched::information_criterion(
@@ -835,18 +855,33 @@ void estimate_x0(raft::handle_t& handle,
                  const double* d_y,
                  int batch_size,
                  int n_obs,
-                 const ARIMAOrder& order)
+                 const ARIMAOrder& order,
+                 bool missing)
 {
   ML::PUSH_RANGE(__func__);
   const auto& handle_impl = handle;
   auto stream             = handle_impl.get_stream();
   auto cublas_handle      = handle_impl.get_cublas_handle();
 
+  // Least squares can't deal with missing values: create copy with naive
+  // replacements for missing values
+  const double* d_y_no_missing;
+  rmm::device_uvector<double> y_no_missing(0, stream);
+  if (missing) {
+    y_no_missing.resize(n_obs * batch_size, stream);
+    d_y_no_missing = y_no_missing.data();
+
+    raft::copy(y_no_missing.data(), d_y, n_obs * batch_size, stream);
+    MLCommon::TimeSeries::fillna(y_no_missing.data(), batch_size, n_obs, stream);
+  } else {
+    d_y_no_missing = d_y;
+  }
+
   // Difference if necessary, copy otherwise
   MLCommon::LinAlg::Batched::Matrix<double> bm_yd(
     n_obs - order.d - order.s * order.D, 1, batch_size, cublas_handle, stream, false);
   MLCommon::TimeSeries::prepare_data(
-    bm_yd.raw_data(), d_y, batch_size, n_obs, order.d, order.D, order.s, stream);
+    bm_yd.raw_data(), d_y_no_missing, batch_size, n_obs, order.d, order.D, order.s, stream);
 
   // Do the computation of the initial parameters
   _start_params(handle, params, bm_yd, order);
diff --git a/cpp/src/arima/batched_kalman.cu b/cpp/src/arima/batched_kalman.cu
index 604312faf1..572f0abb09 100644
--- a/cpp/src/arima/batched_kalman.cu
+++ b/cpp/src/arima/batched_kalman.cu
@@ -79,29 +79,48 @@ DI void MM_l(const double* A, const double* B, double* out)
   }
 }
 
+/** Improve stability by making a covariance matrix symmetric and forcing
+ * diagonal elements to be positive
+ */
+template <int n>
+DI void numerical_stability(double* A)
+{
+  // A = 0.5 * (A + A')
+  for (int i = 0; i < n - 1; i++) {
+    for (int j = i + 1; j < n; j++) {
+      double new_val = 0.5 * (A[j * n + i] + A[i * n + j]);
+      A[j * n + i]   = new_val;
+      A[i * n + j]   = new_val;
+    }
+  }
+  // Aii = abs(Aii)
+  for (int i = 0; i < n; i++) {
+    A[i * n + i] = abs(A[i * n + i]);
+  }
+}
+
 /**
  * Kalman loop kernel. Each thread computes kalman filter for a single series
  * and stores relevant matrices in registers.
  *
- * @tparam     r          Dimension of the state vector
- * @param[in]  ys         Batched time series
- * @param[in]  nobs       Number of observation per series
- * @param[in]  T          Batched transition matrix.            (r x r)
- * @param[in]  Z          Batched "design" vector               (1 x r)
- * @param[in]  RQR        Batched R*Q*R'                        (r x r)
- * @param[in]  P          Batched P                             (r x r)
- * @param[in]  alpha      Batched state vector                  (r x 1)
- * @param[in]  intercept  Do we fit an intercept?
- * @param[in]  d_mu       Batched intercept                     (1)
- * @param[in]  batch_size Batch size
- * @param[out] vs         Batched residuals                     (nobs)
- * @param[out] Fs         Batched variance of prediction errors (nobs)
- * @param[out] sum_logFs  Batched sum of the logs of Fs         (1)
- * @param[in]  n_diff       d + s*D
- * @param[in]  fc_steps   Number of steps to forecast
- * @param[out] d_fc       Array to store the forecast
- * @param[in]  conf_int   Whether to compute confidence intervals
- * @param[out] d_F_fc     Batched variance of forecast errors   (fc_steps)
+ * @tparam     r           Dimension of the state vector
+ * @param[in]  ys          Batched time series
+ * @param[in]  nobs        Number of observation per series
+ * @param[in]  T           Batched transition matrix.            (r x r)
+ * @param[in]  Z           Batched "design" vector               (1 x r)
+ * @param[in]  RQR         Batched R*Q*R'                        (r x r)
+ * @param[in]  P           Batched P                             (r x r)
+ * @param[in]  alpha       Batched state vector                  (r x 1)
+ * @param[in]  intercept   Do we fit an intercept?
+ * @param[in]  d_mu        Batched intercept                     (1)
+ * @param[in]  batch_size  Batch size
+ * @param[out] d_pred      Predictions                           (nobs)
+ * @param[out] d_loglike   Log-likelihood                        (1)
+ * @param[in]  n_diff      d + s*D
+ * @param[in]  fc_steps    Number of steps to forecast
+ * @param[out] d_fc        Array to store the forecast
+ * @param[in]  conf_int    Whether to compute confidence intervals
+ * @param[out] d_F_fc      Batched variance of forecast errors   (fc_steps)
  */
 template <int rd>
 __global__ void batched_kalman_loop_kernel(const double* ys,
@@ -114,9 +133,8 @@ __global__ void batched_kalman_loop_kernel(const double* ys,
                                            bool intercept,
                                            const double* d_mu,
                                            int batch_size,
-                                           double* vs,
-                                           double* Fs,
-                                           double* sum_logFs,
+                                           double* d_pred,
+                                           double* d_loglike,
                                            int n_diff,
                                            int fc_steps   = 0,
                                            double* d_fc   = nullptr,
@@ -152,57 +170,74 @@ __global__ void batched_kalman_loop_kernel(const double* ys,
     }
 
     double b_sum_logFs = 0.0;
+    double b_ll_s2     = 0.0;
+    int n_obs_ll       = 0;
     const double* b_ys = ys + bid * nobs;
-    double* b_vs       = vs + bid * nobs;
-    double* b_Fs       = Fs + bid * nobs;
-
-    double mu = intercept ? d_mu[bid] : 0.0;
+    double* b_pred     = d_pred + bid * nobs;
+    double mu          = intercept ? d_mu[bid] : 0.0;
 
     for (int it = 0; it < nobs; it++) {
-      // 1. v = y - Z*alpha
-      double vs_it = b_ys[it];
-      if (n_diff == 0)
-        vs_it -= l_alpha[0];
-      else {
-        for (int i = 0; i < rd; i++) {
-          vs_it -= l_alpha[i] * l_Z[i];
+      double _Fs, vs_it;
+      bool missing;
+      {
+        // 1. v = y - Z*alpha
+        double pred;
+        if (n_diff == 0)
+          pred = l_alpha[0];
+        else {
+          pred = 0.0;
+          for (int i = 0; i < rd; i++) {
+            pred += l_alpha[i] * l_Z[i];
+          }
         }
-      }
-      b_vs[it] = vs_it;
-
-      // 2. F = Z*P*Z'
-      double _Fs;
-      if (n_diff == 0)
-        _Fs = l_P[0];
-      else {
-        _Fs = 0.0;
-        for (int i = 0; i < rd; i++) {
-          for (int j = 0; j < rd; j++) {
-            _Fs += l_P[j * rd + i] * l_Z[i] * l_Z[j];
+        b_pred[it] = pred;
+        double yt  = b_ys[it];
+        missing    = isnan(yt);
+
+        if (!missing) {
+          vs_it = yt - pred;
+
+          // 2. F = Z*P*Z'
+          if (n_diff == 0)
+            _Fs = l_P[0];
+          else {
+            _Fs = 0.0;
+            for (int i = 0; i < rd; i++) {
+              for (int j = 0; j < rd; j++) {
+                _Fs += l_P[j * rd + i] * l_Z[i] * l_Z[j];
+              }
+            }
+          }
+
+          if (it >= n_diff) {
+            b_sum_logFs += log(_Fs);
+            b_ll_s2 += vs_it * vs_it / _Fs;
+            n_obs_ll++;
           }
         }
       }
-      b_Fs[it] = _Fs;
-      if (it >= n_diff) b_sum_logFs += log(_Fs);
 
       // 3. K = 1/Fs[it] * T*P*Z'
       // TP = T*P
       MM_l<rd>(l_T, l_P, l_TP);
-      // K = 1/Fs[it] * TP*Z'
-      double _1_Fs = 1.0 / _Fs;
-      if (n_diff == 0) {
-        for (int i = 0; i < rd; i++) {
-          l_K[i] = _1_Fs * l_TP[i];
+      if (!missing) {
+        // K = 1/Fs[it] * TP*Z'
+        double _1_Fs = 1.0 / _Fs;
+        if (n_diff == 0) {
+          for (int i = 0; i < rd; i++) {
+            l_K[i] = _1_Fs * l_TP[i];
+          }
+        } else {
+          Mv_l<rd>(_1_Fs, l_TP, l_Z, l_K);
         }
-      } else
-        Mv_l<rd>(_1_Fs, l_TP, l_Z, l_K);
+      }
 
       // 4. alpha = T*alpha + K*vs[it] + c
       // tmp = T*alpha
       Mv_l<rd>(l_T, l_alpha, l_tmp);
       // alpha = tmp + K*vs[it]
       for (int i = 0; i < rd; i++) {
-        l_alpha[i] = l_tmp[i] + l_K[i] * vs_it;
+        l_alpha[i] = l_tmp[i] + (missing ? 0.0 : l_K[i] * vs_it);
       }
       // alpha = alpha + c
       l_alpha[n_diff] += mu;
@@ -212,15 +247,17 @@ __global__ void batched_kalman_loop_kernel(const double* ys,
       for (int i = 0; i < rd2; i++) {
         l_tmp[i] = l_T[i];
       }
-      // L = L - K * Z
-      if (n_diff == 0) {
-        for (int i = 0; i < rd; i++) {
-          l_tmp[i] -= l_K[i];
-        }
-      } else {
-        for (int i = 0; i < rd; i++) {
-          for (int j = 0; j < rd; j++) {
-            l_tmp[j * rd + i] -= l_K[i] * l_Z[j];
+      if (!missing) {
+        // L = L - K * Z
+        if (n_diff == 0) {
+          for (int i = 0; i < rd; i++) {
+            l_tmp[i] -= l_K[i];
+          }
+        } else {
+          for (int i = 0; i < rd; i++) {
+            for (int j = 0; j < rd; j++) {
+              l_tmp[j * rd + i] -= l_K[i] * l_Z[j];
+            }
           }
         }
       }
@@ -232,8 +269,17 @@ __global__ void batched_kalman_loop_kernel(const double* ys,
       for (int i = 0; i < rd2; i++) {
         l_P[i] += l_RQR[i];
       }
+
+      // Numerical stability: enforce symmetry of P and positivity of diagonal
+      numerical_stability<rd>(l_P);
+    }
+
+    // Compute log-likelihood
+    {
+      double n_obs_ll_f = static_cast<double>(n_obs_ll);
+      b_ll_s2 /= n_obs_ll_f;
+      d_loglike[bid] = -.5 * (b_sum_logFs + n_obs_ll_f * (b_ll_s2 + log(2 * M_PI)));
     }
-    sum_logFs[bid] = b_sum_logFs;
 
     // Forecast
     {
@@ -279,6 +325,9 @@ __global__ void batched_kalman_loop_kernel(const double* ys,
           for (int i = 0; i < rd2; i++) {
             l_P[i] += l_RQR[i];
           }
+
+          // Numerical stability: enforce symmetry of P and positivity of diagonal
+          numerical_stability<rd>(l_P);
         }
       }
     }
@@ -289,11 +338,12 @@ __global__ void batched_kalman_loop_kernel(const double* ys,
  * This union allows for efficient reuse of shared memory in the Kalman
  * filter.
  */
-template <typename GemmPolicy, typename GemvPolicy, typename T>
+template <typename GemmPolicy, typename GemvPolicy, typename CovPolicy, typename T>
 union KalmanLoopSharedMemory {
   MLCommon::LinAlg::ReductionStorage<GemmPolicy::BlockSize, T> reduction_storage;
   MLCommon::LinAlg::GemmStorage<GemmPolicy, T> gemm_storage;
   MLCommon::LinAlg::GemvStorage<GemvPolicy, T> gemv_storage[2];
+  MLCommon::LinAlg::CovStabilityStorage<CovPolicy, T> cov_stability_storage;
 };
 
 /**
@@ -301,6 +351,7 @@ union KalmanLoopSharedMemory {
  *
  * @tparam     GemmPolicy  Execution policy for GEMM
  * @tparam     GemvPolicy  Execution policy for GEMV
+ * @tparam     CovPolicy   Execution policy for the covariance stability operation
  * @param[in]  d_ys        Batched time series
  * @param[in]  batch_size  Batch size
  * @param[in]  n_obs       Number of observation per series
@@ -314,16 +365,15 @@ union KalmanLoopSharedMemory {
  * @param[in]  intercept   Do we fit an intercept?
  * @param[in]  d_mu        Batched intercept                     (1)
  * @param[in]  rd          State vector dimension
- * @param[out] d_vs        Batched residuals                     (nobs)
- * @param[out] d_Fs        Batched variance of prediction errors (nobs)
- * @param[out] d_sum_logFs Batched sum of the logs of Fs         (1)
+ * @param[out] d_pred      Predictions                           (nobs)
+ * @param[out] d_loglike   Log-likelihood                        (1)
  * @param[in]  n_diff      d + s*D
  * @param[in]  fc_steps    Number of steps to forecast
  * @param[out] d_fc        Array to store the forecast
  * @param[in]  conf_int    Whether to compute confidence intervals
  * @param[out] d_F_fc      Batched variance of forecast errors   (fc_steps)
  */
-template <typename GemmPolicy, typename GemvPolicy>
+template <typename GemmPolicy, typename GemvPolicy, typename CovPolicy>
 __global__ void _batched_kalman_device_loop_large_kernel(const double* d_ys,
                                                          int batch_size,
                                                          int n_obs,
@@ -337,9 +387,8 @@ __global__ void _batched_kalman_device_loop_large_kernel(const double* d_ys,
                                                          bool intercept,
                                                          const double* d_mu,
                                                          int rd,
-                                                         double* d_vs,
-                                                         double* d_Fs,
-                                                         double* d_sum_logFs,
+                                                         double* d_pred,
+                                                         double* d_loglike,
                                                          int n_diff,
                                                          int fc_steps,
                                                          double* d_fc,
@@ -355,7 +404,7 @@ __global__ void _batched_kalman_device_loop_large_kernel(const double* d_ys,
   double* shared_alpha = (double*)(dyna_shared_mem + 2 * rd * sizeof(double));
   double* shared_K     = (double*)(dyna_shared_mem + 3 * rd * sizeof(double));
 
-  __shared__ KalmanLoopSharedMemory<GemmPolicy, GemvPolicy, double> shared_mem;
+  __shared__ KalmanLoopSharedMemory<GemmPolicy, GemvPolicy, CovPolicy, double> shared_mem;
 
   for (int bid = blockIdx.x; bid < batch_size; bid += gridDim.x) {
     /* Load Z and alpha to shared memory */
@@ -369,31 +418,69 @@ __global__ void _batched_kalman_device_loop_large_kernel(const double* d_ys,
     /* Initialization */
     double mu_       = intercept ? d_mu[bid] : 0.0;
     double sum_logFs = 0.0;
+    double ll_s2     = 0.0;
+    int n_obs_ll     = 0;
+    int it;
 
-    /* Kalman loop */
-    for (int it = 0; it < n_obs; it++) {
-      // 1.
-      double vt = d_ys[bid * n_obs + it];
+    /* Skip missing observations at the start */
+    {
+      double pred0;
       if (n_diff == 0) {
-        vt -= shared_alpha[0];
+        pred0 = shared_alpha[0];
       } else {
-        vt -= MLCommon::LinAlg::_block_dot<GemmPolicy::BlockSize, true>(
+        pred0 = 0.0;
+        pred0 += MLCommon::LinAlg::_block_dot<GemmPolicy::BlockSize, true>(
           rd, shared_Z, shared_alpha, shared_mem.reduction_storage);
         __syncthreads();  // necessary to reuse shared memory
       }
-      if (threadIdx.x == 0) d_vs[bid * n_obs + it] = vt;
 
-      // 2.
-      double _F;
-      if (n_diff == 0) {
-        _F = (d_P + bid * rd2)[0];
-      } else {
-        _F = MLCommon::LinAlg::_block_xAxt<GemmPolicy::BlockSize, true, false>(
-          rd, shared_Z, d_P + bid * rd2, shared_mem.reduction_storage);
-        __syncthreads();  // necessary to reuse shared memory
+      for (it = 0; it < n_obs && isnan(d_ys[bid * n_obs + it]); it++) {
+        if (threadIdx.x == 0) d_pred[bid * n_obs + it] = pred0;
+      }
+    }
+
+    /* Kalman loop */
+    for (; it < n_obs; it++) {
+      double vt, _F;
+      bool missing;
+      {
+        // 1. pred = Z*alpha
+        //    v = y - pred
+        double pred;
+        if (n_diff == 0) {
+          pred = shared_alpha[0];
+        } else {
+          pred = 0.0;
+          pred += MLCommon::LinAlg::_block_dot<GemmPolicy::BlockSize, true>(
+            rd, shared_Z, shared_alpha, shared_mem.reduction_storage);
+          __syncthreads();  // necessary to reuse shared memory
+        }
+        double yt = d_ys[bid * n_obs + it];
+        missing   = isnan(yt);
+
+        if (!missing) {
+          vt = yt - pred;
+
+          // 2. F = Z*P*Z'
+          if (n_diff == 0) {
+            _F = (d_P + bid * rd2)[0];
+          } else {
+            _F = MLCommon::LinAlg::_block_xAxt<GemmPolicy::BlockSize, true, false>(
+              rd, shared_Z, d_P + bid * rd2, shared_mem.reduction_storage);
+            __syncthreads();  // necessary to reuse shared memory
+          }
+        }
+
+        if (threadIdx.x == 0) {
+          d_pred[bid * n_obs + it] = pred;
+
+          if (it >= n_diff && !missing) {
+            sum_logFs += log(_F);
+            ll_s2 += vt * vt / _F;
+            n_obs_ll++;
+          }
+        }
       }
-      if (threadIdx.x == 0) d_Fs[bid * n_obs + it] = _F;
-      if (threadIdx.x == 0 && it >= n_diff) sum_logFs += log(_F);
 
       // 3. K = 1/Fs[it] * T*P*Z'
       // TP = T*P (also used later)
@@ -408,13 +495,15 @@ __global__ void _batched_kalman_device_loop_large_kernel(const double* d_ys,
                                                 d_TP + bid * rd2,
                                                 shared_mem.gemm_storage);
       __syncthreads();  // for consistency of TP
-      // K = 1/Fs[it] * TP*Z'
-      double _1_Fs = 1.0 / _F;
-      if (n_diff == 0) {
-        MLCommon::LinAlg::_block_ax(rd, _1_Fs, d_TP + bid * rd2, shared_K);
-      } else {
-        MLCommon::LinAlg::_block_gemv<GemvPolicy, false>(
-          rd, rd, _1_Fs, d_TP + bid * rd2, shared_Z, shared_K, shared_mem.gemv_storage[0]);
+      if (!missing) {
+        // K = 1/Fs[it] * TP*Z'
+        double _1_Fs = 1.0 / _F;
+        if (n_diff == 0) {
+          MLCommon::LinAlg::_block_ax(rd, _1_Fs, d_TP + bid * rd2, shared_K);
+        } else {
+          MLCommon::LinAlg::_block_gemv<GemvPolicy, false>(
+            rd, rd, _1_Fs, d_TP + bid * rd2, shared_Z, shared_K, shared_mem.gemv_storage[0]);
+        }
       }
 
       // 4. alpha = T*alpha + K*vs[it] + c
@@ -425,18 +514,18 @@ __global__ void _batched_kalman_device_loop_large_kernel(const double* d_ys,
       // alpha = vec1 + K*vs[it] + c
       for (int i = threadIdx.x; i < rd; i += GemmPolicy::BlockSize) {
         double c_       = (i == n_diff) ? mu_ : 0.0;
-        shared_alpha[i] = shared_vec0[i] + vt * shared_K[i] + c_;
+        shared_alpha[i] = shared_vec0[i] + c_ + (missing ? 0.0 : vt * shared_K[i]);
       }
 
       // 5. L = T - K * Z
       if (n_diff == 0) {
         for (int i = threadIdx.x; i < rd2; i += GemmPolicy::BlockSize) {
-          double _KZ             = (i < rd) ? shared_K[i] : 0.0;
+          double _KZ             = (i < rd && !missing) ? shared_K[i] : 0.0;
           d_m_tmp[bid * rd2 + i] = d_T[bid * rd2 + i] - _KZ;
         }
       } else {
         for (int i = threadIdx.x; i < rd2; i += GemmPolicy::BlockSize) {
-          double _KZ             = shared_K[i % rd] * shared_Z[i / rd];
+          double _KZ             = missing ? 0.0 : shared_K[i % rd] * shared_Z[i / rd];
           d_m_tmp[bid * rd2 + i] = d_T[bid * rd2 + i] - _KZ;
         }
       }
@@ -455,13 +544,19 @@ __global__ void _batched_kalman_device_loop_large_kernel(const double* d_ys,
                                                 d_P + bid * rd2,
                                                 shared_mem.gemm_storage);
       __syncthreads();  // For consistency of P
-      // P = P + R*Q*R'
+      // tmp = P + R*Q*R'
       /// TODO: shared mem R instead of precomputed matrix?
       for (int i = threadIdx.x; i < rd2; i += GemmPolicy::BlockSize) {
-        d_P[bid * rd2 + i] += d_RQR[bid * rd2 + i];
+        d_m_tmp[bid * rd2 + i] = d_P[bid * rd2 + i] + d_RQR[bid * rd2 + i];
       }
-
-      __syncthreads();  // necessary to reuse shared memory
+      __syncthreads();
+
+      // Numerical stability: enforce symmetry of P and positivity of diagonal
+      // P = 0.5 * (tmp + tmp')
+      // Pii = abs(Pii)
+      MLCommon::LinAlg::_block_covariance_stability<CovPolicy>(
+        rd, d_m_tmp + bid * rd2, d_P + bid * rd2, shared_mem.cov_stability_storage);
+      __syncthreads();
     }
 
     /* Forecast */
@@ -529,12 +624,23 @@ __global__ void _batched_kalman_device_loop_large_kernel(const double* d_ys,
       // P = P + R*Q*R'
       /// TODO: shared mem R instead of precomputed matrix?
       for (int i = threadIdx.x; i < rd2; i += GemmPolicy::BlockSize) {
-        d_P[bid * rd2 + i] += d_RQR[bid * rd2 + i];
+        d_m_tmp[bid * rd2 + i] = d_P[bid * rd2 + i] + d_RQR[bid * rd2 + i];
       }
+
+      __syncthreads();
+      // Numerical stability: enforce symmetry of P and positivity of diagonal
+      // P = 0.5 * (tmp + tmp')
+      // Pii = abs(Pii)
+      MLCommon::LinAlg::_block_covariance_stability<CovPolicy>(
+        rd, d_m_tmp + bid * rd2, d_P + bid * rd2, shared_mem.cov_stability_storage);
     }
 
-    /* Write to global mem */
-    if (threadIdx.x == 0) d_sum_logFs[bid] = sum_logFs;
+    /* Compute log-likelihood */
+    if (threadIdx.x == 0) {
+      double n_obs_ll_f = static_cast<double>(n_obs_ll);
+      ll_s2 /= n_obs_ll_f;
+      d_loglike[bid] = -.5 * (sum_logFs + n_obs_ll_f * (ll_s2 + log(2 * M_PI)));
+    }
   }
 }
 
@@ -552,16 +658,15 @@ __global__ void _batched_kalman_device_loop_large_kernel(const double* d_ys,
  * @param[in]  intercept    Do we fit an intercept?
  * @param[in]  d_mu         Batched intercept                     (1)
  * @param[in]  rd           Dimension of the state vector
- * @param[out] d_vs         Batched residuals                     (nobs)
- * @param[out] d_Fs         Batched variance of prediction errors (nobs)
- * @param[out] d_sum_logFs  Batched sum of the logs of Fs         (1)
+ * @param[out] d_pred       Predictions                           (nobs)
+ * @param[out] d_loglike    Log-likelihood                        (1)
  * @param[in]  n_diff       d + s*D
  * @param[in]  fc_steps     Number of steps to forecast
  * @param[out] d_fc         Array to store the forecast
  * @param[in]  conf_int     Whether to compute confidence intervals
  * @param[out] d_F_fc       Batched variance of forecast errors   (fc_steps)
  */
-template <typename GemmPolicy, typename GemvPolicy>
+template <typename GemmPolicy, typename GemvPolicy, typename CovPolicy>
 void _batched_kalman_device_loop_large(const ARIMAMemory<double>& arima_mem,
                                        const double* d_ys,
                                        int n_obs,
@@ -573,9 +678,8 @@ void _batched_kalman_device_loop_large(const ARIMAMemory<double>& arima_mem,
                                        bool intercept,
                                        const double* d_mu,
                                        int rd,
-                                       double* d_vs,
-                                       double* d_Fs,
-                                       double* d_sum_logFs,
+                                       double* d_pred,
+                                       double* d_loglike,
                                        int n_diff,
                                        int fc_steps   = 0,
                                        double* d_fc   = nullptr,
@@ -584,6 +688,8 @@ void _batched_kalman_device_loop_large(const ARIMAMemory<double>& arima_mem,
 {
   static_assert(GemmPolicy::BlockSize == GemvPolicy::BlockSize,
                 "Gemm and gemv policies: block size mismatch");
+  static_assert(GemmPolicy::BlockSize == CovPolicy::BlockSize,
+                "Gemm and cov stability policies: block size mismatch");
 
   auto stream       = T.stream();
   auto cublasHandle = T.cublasHandle();
@@ -603,7 +709,7 @@ void _batched_kalman_device_loop_large(const ARIMAMemory<double>& arima_mem,
 
   int grid_size          = std::min(batch_size, 65536);
   size_t shared_mem_size = 4 * rd * sizeof(double);
-  _batched_kalman_device_loop_large_kernel<GemmPolicy, GemvPolicy>
+  _batched_kalman_device_loop_large_kernel<GemmPolicy, GemvPolicy, CovPolicy>
     <<<grid_size, GemmPolicy::BlockSize, shared_mem_size, stream>>>(d_ys,
                                                                     batch_size,
                                                                     n_obs,
@@ -617,9 +723,8 @@ void _batched_kalman_device_loop_large(const ARIMAMemory<double>& arima_mem,
                                                                     intercept,
                                                                     d_mu,
                                                                     rd,
-                                                                    d_vs,
-                                                                    d_Fs,
-                                                                    d_sum_logFs,
+                                                                    d_pred,
+                                                                    d_loglike,
                                                                     n_diff,
                                                                     fc_steps,
                                                                     d_fc,
@@ -640,9 +745,8 @@ void batched_kalman_loop(raft::handle_t& handle,
                          bool intercept,
                          const double* d_mu,
                          const ARIMAOrder& order,
-                         double* vs,
-                         double* Fs,
-                         double* sum_logFs,
+                         double* d_pred,
+                         double* d_loglike,
                          int fc_steps   = 0,
                          double* d_fc   = nullptr,
                          bool conf_int  = false,
@@ -668,9 +772,8 @@ void batched_kalman_loop(raft::handle_t& handle,
                                                          intercept,
                                                          d_mu,
                                                          batch_size,
-                                                         vs,
-                                                         Fs,
-                                                         sum_logFs,
+                                                         d_pred,
+                                                         d_loglike,
                                                          n_diff,
                                                          fc_steps,
                                                          d_fc,
@@ -689,9 +792,8 @@ void batched_kalman_loop(raft::handle_t& handle,
                                                          intercept,
                                                          d_mu,
                                                          batch_size,
-                                                         vs,
-                                                         Fs,
-                                                         sum_logFs,
+                                                         d_pred,
+                                                         d_loglike,
                                                          n_diff,
                                                          fc_steps,
                                                          d_fc,
@@ -710,9 +812,8 @@ void batched_kalman_loop(raft::handle_t& handle,
                                                          intercept,
                                                          d_mu,
                                                          batch_size,
-                                                         vs,
-                                                         Fs,
-                                                         sum_logFs,
+                                                         d_pred,
+                                                         d_loglike,
                                                          n_diff,
                                                          fc_steps,
                                                          d_fc,
@@ -731,9 +832,8 @@ void batched_kalman_loop(raft::handle_t& handle,
                                                          intercept,
                                                          d_mu,
                                                          batch_size,
-                                                         vs,
-                                                         Fs,
-                                                         sum_logFs,
+                                                         d_pred,
+                                                         d_loglike,
                                                          n_diff,
                                                          fc_steps,
                                                          d_fc,
@@ -752,9 +852,8 @@ void batched_kalman_loop(raft::handle_t& handle,
                                                          intercept,
                                                          d_mu,
                                                          batch_size,
-                                                         vs,
-                                                         Fs,
-                                                         sum_logFs,
+                                                         d_pred,
+                                                         d_loglike,
                                                          n_diff,
                                                          fc_steps,
                                                          d_fc,
@@ -773,9 +872,8 @@ void batched_kalman_loop(raft::handle_t& handle,
                                                          intercept,
                                                          d_mu,
                                                          batch_size,
-                                                         vs,
-                                                         Fs,
-                                                         sum_logFs,
+                                                         d_pred,
+                                                         d_loglike,
                                                          n_diff,
                                                          fc_steps,
                                                          d_fc,
@@ -794,9 +892,8 @@ void batched_kalman_loop(raft::handle_t& handle,
                                                          intercept,
                                                          d_mu,
                                                          batch_size,
-                                                         vs,
-                                                         Fs,
-                                                         sum_logFs,
+                                                         d_pred,
+                                                         d_loglike,
                                                          n_diff,
                                                          fc_steps,
                                                          d_fc,
@@ -815,9 +912,8 @@ void batched_kalman_loop(raft::handle_t& handle,
                                                          intercept,
                                                          d_mu,
                                                          batch_size,
-                                                         vs,
-                                                         Fs,
-                                                         sum_logFs,
+                                                         d_pred,
+                                                         d_loglike,
                                                          n_diff,
                                                          fc_steps,
                                                          d_fc,
@@ -833,177 +929,139 @@ void batched_kalman_loop(raft::handle_t& handle,
       if (batch_size <= 2 * num_sm) {
         using GemmPolicy = MLCommon::LinAlg::BlockGemmPolicy<1, 16, 1, 1, 16, 16>;
         using GemvPolicy = MLCommon::LinAlg::BlockGemvPolicy<16, 16>;
-        _batched_kalman_device_loop_large<GemmPolicy, GemvPolicy>(arima_mem,
-                                                                  ys,
-                                                                  nobs,
-                                                                  T,
-                                                                  Z,
-                                                                  RQR,
-                                                                  P0,
-                                                                  alpha,
-                                                                  intercept,
-                                                                  d_mu,
-                                                                  rd,
-                                                                  vs,
-                                                                  Fs,
-                                                                  sum_logFs,
-                                                                  n_diff,
-                                                                  fc_steps,
-                                                                  d_fc,
-                                                                  conf_int,
-                                                                  d_F_fc);
+        using CovPolicy  = MLCommon::LinAlg::BlockPolicy<1, 1, 16, 16>;
+        _batched_kalman_device_loop_large<GemmPolicy, GemvPolicy, CovPolicy>(arima_mem,
+                                                                             ys,
+                                                                             nobs,
+                                                                             T,
+                                                                             Z,
+                                                                             RQR,
+                                                                             P0,
+                                                                             alpha,
+                                                                             intercept,
+                                                                             d_mu,
+                                                                             rd,
+                                                                             d_pred,
+                                                                             d_loglike,
+                                                                             n_diff,
+                                                                             fc_steps,
+                                                                             d_fc,
+                                                                             conf_int,
+                                                                             d_F_fc);
       } else {
         using GemmPolicy = MLCommon::LinAlg::BlockGemmPolicy<1, 16, 1, 4, 16, 4>;
         using GemvPolicy = MLCommon::LinAlg::BlockGemvPolicy<16, 4>;
-        _batched_kalman_device_loop_large<GemmPolicy, GemvPolicy>(arima_mem,
-                                                                  ys,
-                                                                  nobs,
-                                                                  T,
-                                                                  Z,
-                                                                  RQR,
-                                                                  P0,
-                                                                  alpha,
-                                                                  intercept,
-                                                                  d_mu,
-                                                                  rd,
-                                                                  vs,
-                                                                  Fs,
-                                                                  sum_logFs,
-                                                                  n_diff,
-                                                                  fc_steps,
-                                                                  d_fc,
-                                                                  conf_int,
-                                                                  d_F_fc);
+        using CovPolicy  = MLCommon::LinAlg::BlockPolicy<1, 4, 16, 4>;
+        _batched_kalman_device_loop_large<GemmPolicy, GemvPolicy, CovPolicy>(arima_mem,
+                                                                             ys,
+                                                                             nobs,
+                                                                             T,
+                                                                             Z,
+                                                                             RQR,
+                                                                             P0,
+                                                                             alpha,
+                                                                             intercept,
+                                                                             d_mu,
+                                                                             rd,
+                                                                             d_pred,
+                                                                             d_loglike,
+                                                                             n_diff,
+                                                                             fc_steps,
+                                                                             d_fc,
+                                                                             conf_int,
+                                                                             d_F_fc);
       }
     } else if (rd <= 32) {
       if (batch_size <= 2 * num_sm) {
         using GemmPolicy = MLCommon::LinAlg::BlockGemmPolicy<1, 32, 1, 4, 32, 8>;
         using GemvPolicy = MLCommon::LinAlg::BlockGemvPolicy<32, 8>;
-        _batched_kalman_device_loop_large<GemmPolicy, GemvPolicy>(arima_mem,
-                                                                  ys,
-                                                                  nobs,
-                                                                  T,
-                                                                  Z,
-                                                                  RQR,
-                                                                  P0,
-                                                                  alpha,
-                                                                  intercept,
-                                                                  d_mu,
-                                                                  rd,
-                                                                  vs,
-                                                                  Fs,
-                                                                  sum_logFs,
-                                                                  n_diff,
-                                                                  fc_steps,
-                                                                  d_fc,
-                                                                  conf_int,
-                                                                  d_F_fc);
+        using CovPolicy  = MLCommon::LinAlg::BlockPolicy<1, 4, 32, 8>;
+        _batched_kalman_device_loop_large<GemmPolicy, GemvPolicy, CovPolicy>(arima_mem,
+                                                                             ys,
+                                                                             nobs,
+                                                                             T,
+                                                                             Z,
+                                                                             RQR,
+                                                                             P0,
+                                                                             alpha,
+                                                                             intercept,
+                                                                             d_mu,
+                                                                             rd,
+                                                                             d_pred,
+                                                                             d_loglike,
+                                                                             n_diff,
+                                                                             fc_steps,
+                                                                             d_fc,
+                                                                             conf_int,
+                                                                             d_F_fc);
       } else {
         using GemmPolicy = MLCommon::LinAlg::BlockGemmPolicy<1, 32, 1, 8, 32, 4>;
         using GemvPolicy = MLCommon::LinAlg::BlockGemvPolicy<32, 4>;
-        _batched_kalman_device_loop_large<GemmPolicy, GemvPolicy>(arima_mem,
-                                                                  ys,
-                                                                  nobs,
-                                                                  T,
-                                                                  Z,
-                                                                  RQR,
-                                                                  P0,
-                                                                  alpha,
-                                                                  intercept,
-                                                                  d_mu,
-                                                                  rd,
-                                                                  vs,
-                                                                  Fs,
-                                                                  sum_logFs,
-                                                                  n_diff,
-                                                                  fc_steps,
-                                                                  d_fc,
-                                                                  conf_int,
-                                                                  d_F_fc);
+        using CovPolicy  = MLCommon::LinAlg::BlockPolicy<1, 8, 32, 4>;
+        _batched_kalman_device_loop_large<GemmPolicy, GemvPolicy, CovPolicy>(arima_mem,
+                                                                             ys,
+                                                                             nobs,
+                                                                             T,
+                                                                             Z,
+                                                                             RQR,
+                                                                             P0,
+                                                                             alpha,
+                                                                             intercept,
+                                                                             d_mu,
+                                                                             rd,
+                                                                             d_pred,
+                                                                             d_loglike,
+                                                                             n_diff,
+                                                                             fc_steps,
+                                                                             d_fc,
+                                                                             conf_int,
+                                                                             d_F_fc);
       }
     } else if (rd > 64 && rd <= 128) {
       using GemmPolicy = MLCommon::LinAlg::BlockGemmPolicy<1, 16, 1, 16, 128, 2>;
       using GemvPolicy = MLCommon::LinAlg::BlockGemvPolicy<128, 2>;
-      _batched_kalman_device_loop_large<GemmPolicy, GemvPolicy>(arima_mem,
-                                                                ys,
-                                                                nobs,
-                                                                T,
-                                                                Z,
-                                                                RQR,
-                                                                P0,
-                                                                alpha,
-                                                                intercept,
-                                                                d_mu,
-                                                                rd,
-                                                                vs,
-                                                                Fs,
-                                                                sum_logFs,
-                                                                n_diff,
-                                                                fc_steps,
-                                                                d_fc,
-                                                                conf_int,
-                                                                d_F_fc);
+      using CovPolicy  = MLCommon::LinAlg::BlockPolicy<1, 8, 64, 4>;
+      _batched_kalman_device_loop_large<GemmPolicy, GemvPolicy, CovPolicy>(arima_mem,
+                                                                           ys,
+                                                                           nobs,
+                                                                           T,
+                                                                           Z,
+                                                                           RQR,
+                                                                           P0,
+                                                                           alpha,
+                                                                           intercept,
+                                                                           d_mu,
+                                                                           rd,
+                                                                           d_pred,
+                                                                           d_loglike,
+                                                                           n_diff,
+                                                                           fc_steps,
+                                                                           d_fc,
+                                                                           conf_int,
+                                                                           d_F_fc);
     } else {
       using GemmPolicy = MLCommon::LinAlg::BlockGemmPolicy<1, 32, 1, 16, 64, 4>;
       using GemvPolicy = MLCommon::LinAlg::BlockGemvPolicy<64, 4>;
-      _batched_kalman_device_loop_large<GemmPolicy, GemvPolicy>(arima_mem,
-                                                                ys,
-                                                                nobs,
-                                                                T,
-                                                                Z,
-                                                                RQR,
-                                                                P0,
-                                                                alpha,
-                                                                intercept,
-                                                                d_mu,
-                                                                rd,
-                                                                vs,
-                                                                Fs,
-                                                                sum_logFs,
-                                                                n_diff,
-                                                                fc_steps,
-                                                                d_fc,
-                                                                conf_int,
-                                                                d_F_fc);
-    }
-  }
-}
-
-template <int NUM_THREADS>
-__global__ void batched_kalman_loglike_kernel(const double* d_vs,
-                                              const double* d_Fs,
-                                              const double* d_sumLogFs,
-                                              int nobs,
-                                              int batch_size,
-                                              double* d_loglike,
-                                              double* d_sigma2,
-                                              int n_diff,
-                                              double level)
-{
-  using BlockReduce = cub::BlockReduce<double, NUM_THREADS>;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-
-  int tid           = threadIdx.x;
-  int bid           = blockIdx.x;
-  double bid_sigma2 = 0.0;
-  for (int it = 0; it < nobs; it += NUM_THREADS) {
-    // vs and Fs are in time-major order (memory layout: column major)
-    int idx         = (it + tid) + bid * nobs;
-    double d_vs2_Fs = 0.0;
-    if (it + tid >= n_diff && it + tid < nobs) {
-      double _vi = d_vs[idx];
-      d_vs2_Fs   = _vi * _vi / d_Fs[idx];
+      using CovPolicy  = MLCommon::LinAlg::BlockPolicy<1, 16, 64, 4>;
+      _batched_kalman_device_loop_large<GemmPolicy, GemvPolicy, CovPolicy>(arima_mem,
+                                                                           ys,
+                                                                           nobs,
+                                                                           T,
+                                                                           Z,
+                                                                           RQR,
+                                                                           P0,
+                                                                           alpha,
+                                                                           intercept,
+                                                                           d_mu,
+                                                                           rd,
+                                                                           d_pred,
+                                                                           d_loglike,
+                                                                           n_diff,
+                                                                           fc_steps,
+                                                                           d_fc,
+                                                                           conf_int,
+                                                                           d_F_fc);
     }
-    __syncthreads();
-    double partial_sum = BlockReduce(temp_storage).Sum(d_vs2_Fs, nobs - it);
-    bid_sigma2 += partial_sum;
-  }
-  if (tid == 0) {
-    double nobs_diff_f = static_cast<double>(nobs - n_diff);
-    bid_sigma2 /= nobs_diff_f;
-    if (level != 0) d_sigma2[bid] = bid_sigma2;
-    d_loglike[bid] =
-      -.5 * (d_sumLogFs[bid] + nobs_diff_f * bid_sigma2 + nobs_diff_f * (log(2 * M_PI)));
   }
 }
 
@@ -1013,25 +1071,21 @@ __global__ void batched_kalman_loglike_kernel(const double* d_vs,
  * @note: One block per batch member, one thread per forecast time step
  *
  * @param[in]    d_fc       Mean forecasts
- * @param[in]    d_sigma2   sum(v_t * v_t / F_t) / n_obs_diff
  * @param[inout] d_lower    Input: F_{n+t}
  *                          Output: lower bound of the confidence intervals
  * @param[out]   d_upper    Upper bound of the confidence intervals
- * @param[in]    fc_steps   Number of forecast steps
+ * @param[in]    n_elem     Total number of elements (fc_steps * batch_size)
  * @param[in]    multiplier Coefficient associated with the confidence level
  */
-__global__ void confidence_intervals(const double* d_fc,
-                                     const double* d_sigma2,
-                                     double* d_lower,
-                                     double* d_upper,
-                                     int fc_steps,
-                                     double multiplier)
+__global__ void confidence_intervals(
+  const double* d_fc, double* d_lower, double* d_upper, int n_elem, double multiplier)
 {
-  int idx       = blockIdx.x * fc_steps + threadIdx.x;
-  double fc     = d_fc[idx];
-  double margin = multiplier * sqrt(d_lower[idx] * d_sigma2[blockIdx.x]);
-  d_lower[idx]  = fc - margin;
-  d_upper[idx]  = fc + margin;
+  for (int idx = threadIdx.x; idx < n_elem; idx += blockDim.x * gridDim.x) {
+    double fc     = d_fc[idx];
+    double margin = multiplier * sqrt(d_lower[idx]);
+    d_lower[idx]  = fc - margin;
+    d_upper[idx]  = fc + margin;
+  }
 }
 
 void _lyapunov_wrapper(raft::handle_t& handle,
@@ -1086,8 +1140,7 @@ void _batched_kalman_filter(raft::handle_t& handle,
                             const MLCommon::LinAlg::Batched::Matrix<double>& Zb,
                             const MLCommon::LinAlg::Batched::Matrix<double>& Tb,
                             const MLCommon::LinAlg::Batched::Matrix<double>& Rb,
-                            double* d_vs,
-                            double* d_Fs,
+                            double* d_pred,
                             double* d_loglike,
                             const double* d_sigma2,
                             bool intercept,
@@ -1253,30 +1306,18 @@ void _batched_kalman_filter(raft::handle_t& handle,
                       intercept,
                       d_mu,
                       order,
-                      d_vs,
-                      d_Fs,
-                      arima_mem.sumLogF_buffer,
+                      d_pred,
+                      d_loglike,
                       fc_steps,
                       d_fc,
                       level > 0,
                       d_lower);
 
-  // Finalize loglikelihood and prediction intervals
-  constexpr int NUM_THREADS = 128;
-  batched_kalman_loglike_kernel<NUM_THREADS>
-    <<<batch_size, NUM_THREADS, 0, stream>>>(d_vs,
-                                             d_Fs,
-                                             arima_mem.sumLogF_buffer,
-                                             nobs,
-                                             batch_size,
-                                             d_loglike,
-                                             arima_mem.sigma2_buffer,
-                                             n_diff,
-                                             level);
-  CUDA_CHECK(cudaPeekAtLastError());
   if (level > 0) {
-    confidence_intervals<<<batch_size, fc_steps, 0, stream>>>(
-      d_fc, arima_mem.sigma2_buffer, d_lower, d_upper, fc_steps, sqrt(2.0) * erfinv(level));
+    constexpr int TPB_conf = 256;
+    int n_blocks           = raft::ceildiv<int>(fc_steps * batch_size, TPB_conf);
+    confidence_intervals<<<n_blocks, TPB_conf, 0, stream>>>(
+      d_fc, d_lower, d_upper, fc_steps * batch_size, sqrt(2.0) * erfinv(level));
     CUDA_CHECK(cudaPeekAtLastError());
   }
 }
@@ -1398,7 +1439,7 @@ void batched_kalman_filter(raft::handle_t& handle,
                            const ARIMAOrder& order,
                            int batch_size,
                            double* d_loglike,
-                           double* d_vs,
+                           double* d_pred,
                            int fc_steps,
                            double* d_fc,
                            double level,
@@ -1443,8 +1484,7 @@ void batched_kalman_filter(raft::handle_t& handle,
                          Zb,
                          Tb,
                          Rb,
-                         d_vs,
-                         arima_mem.F_buffer,
+                         d_pred,
                          d_loglike,
                          params.sigma2,
                          static_cast<bool>(order.k),
diff --git a/cpp/src/dbscan/adjgraph/naive.cuh b/cpp/src/dbscan/adjgraph/naive.cuh
index 6ef2830c7d..f633a2e4c8 100644
--- a/cpp/src/dbscan/adjgraph/naive.cuh
+++ b/cpp/src/dbscan/adjgraph/naive.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <raft/cudart_utils.h>
+#include <cuml/common/pinned_host_vector.hpp>
 #include <raft/cuda_utils.cuh>
 #include <vector>
 #include "../common.cuh"
@@ -35,14 +36,14 @@ void launcher(const raft::handle_t& handle,
 {
   Index_ k = 0;
   Index_ N = data.N;
-  std::vector<Index_> host_vd(batch_size + 1);
-  std::vector<char> host_adj(((batch_size * N) / 8) + 1);
-  std::vector<Index_> host_ex_scan(batch_size);
+  ML::pinned_host_vector<Index_> host_vd(batch_size + 1);
+  ML::pinned_host_vector<char> host_adj(((batch_size * N) / 8) + 1);
+  ML::pinned_host_vector<Index_> host_ex_scan(batch_size);
   raft::update_host((bool*)host_adj.data(), data.adj, batch_size * N, stream);
   raft::update_host(host_vd.data(), data.vd, batch_size + 1, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
   size_t adjgraph_size = size_t(host_vd[batch_size]);
-  std::vector<Index_> host_adj_graph(adjgraph_size);
+  ML::pinned_host_vector<Index_> host_adj_graph(adjgraph_size);
   for (Index_ i = 0; i < batch_size; i++) {
     for (Index_ j = 0; j < N; j++) {
       /// TODO: change layout or remove; cf #3414
diff --git a/cpp/src/dbscan/dbscan.cuh b/cpp/src/dbscan/dbscan.cuh
index 647420db1b..9d7c1061cc 100644
--- a/cpp/src/dbscan/dbscan.cuh
+++ b/cpp/src/dbscan/dbscan.cuh
@@ -60,6 +60,7 @@ size_t compute_batch_size(size_t& estimated_memory,
   // from the over-estimation of the sparse adjacency matrix
 
   // Batch size determined based on available memory
+  ASSERT(est_mem_per_row > 0, "Estimated memory per row is 0 for DBSCAN");
   size_t batch_size = (max_mbytes_per_batch * 1000000 - est_mem_fixed) / est_mem_per_row;
 
   // Limit batch size to number of owned rows
@@ -118,6 +119,8 @@ void dbscanFitImpl(const raft::handle_t& handle,
   Index_ start_row{0};
   Index_ n_owned_rows{n_rows};
 
+  ASSERT(n_rows > 0, "No rows in the input array. DBSCAN cannot be fitted!");
+
   if (opg) {
     const auto& comm     = handle.get_comms();
     my_rank              = comm.get_rank();
diff --git a/cpp/src/decisiontree/batched-levelalgo/builder.cuh b/cpp/src/decisiontree/batched-levelalgo/builder.cuh
index 8dc897f414..68529b9c44 100644
--- a/cpp/src/decisiontree/batched-levelalgo/builder.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/builder.cuh
@@ -27,6 +27,7 @@
 #include <common/grid_sync.cuh>
 #include <common/nvtx.hpp>
 #include <cuml/common/logger.hpp>
+#include <cuml/common/pinned_host_vector.hpp>
 #include <cuml/tree/decisiontree.hpp>
 #include <raft/cuda_utils.cuh>
 #include "input.cuh"
@@ -50,11 +51,12 @@ class NodeQueue {
   std::deque<NodeWorkItem> work_items_;
 
  public:
-  NodeQueue(DecisionTreeParams params, size_t max_nodes, size_t sampled_rows)
+  NodeQueue(DecisionTreeParams params, size_t max_nodes, size_t sampled_rows, int num_outputs)
     : params(params), tree(std::make_shared<DT::TreeMetaDataNode<DataT, LabelT>>())
   {
+    tree->num_outputs = num_outputs;
     tree->sparsetree.reserve(max_nodes);
-    tree->sparsetree.emplace_back(NodeT::CreateLeafNode(0, sampled_rows));
+    tree->sparsetree.emplace_back(NodeT::CreateLeafNode(sampled_rows));
     tree->leaf_counter  = 1;
     tree->depth_counter = 0;
     node_instances_.reserve(max_nodes);
@@ -112,7 +114,7 @@ class NodeQueue {
                                                              parent_range.count);
       tree->leaf_counter++;
       // left
-      tree->sparsetree.emplace_back(NodeT::CreateLeafNode(0, split.nLeft));
+      tree->sparsetree.emplace_back(NodeT::CreateLeafNode(split.nLeft));
       node_instances_.emplace_back(InstanceRange{parent_range.begin, std::size_t(split.nLeft)});
 
       // Do not add a work item if this child is definitely a leaf
@@ -122,7 +124,7 @@ class NodeQueue {
       }
 
       // right
-      tree->sparsetree.emplace_back(NodeT::CreateLeafNode(0, parent_range.count - split.nLeft));
+      tree->sparsetree.emplace_back(NodeT::CreateLeafNode(parent_range.count - split.nLeft));
       node_instances_.emplace_back(
         InstanceRange{parent_range.begin + split.nLeft, parent_range.count - split.nLeft});
 
@@ -154,6 +156,7 @@ struct Builder {
   /** default threads per block for most kernels in here */
   static constexpr int TPB_DEFAULT = 128;
   const raft::handle_t& handle;
+  cudaStream_t builder_stream;
   /** DT params */
   DecisionTreeParams params;
   /** input dataset */
@@ -189,9 +192,10 @@ struct Builder {
   const size_t alignValue = 512;
 
   rmm::device_uvector<char> d_buff;
-  std::vector<char> h_buff;
+  ML::pinned_host_vector<char> h_buff;
 
   Builder(const raft::handle_t& handle,
+          cudaStream_t s,
           IdxT treeid,
           uint64_t seed,
           const DecisionTreeParams& p,
@@ -203,6 +207,7 @@ struct Builder {
           IdxT nclasses,
           std::shared_ptr<const rmm::device_uvector<DataT>> quantiles)
     : handle(handle),
+      builder_stream(s),
       treeid(treeid),
       seed(seed),
       params(p),
@@ -216,14 +221,14 @@ struct Builder {
             rowids->data(),
             nclasses,
             quantiles->data()},
-      d_buff(0, handle.get_stream())
+      d_buff(0, builder_stream)
   {
     max_blocks = 1 + params.max_batch_size + input.nSampledRows / TPB_DEFAULT;
     ASSERT(quantiles != nullptr, "Currently quantiles need to be computed before this call!");
     ASSERT(nclasses >= 1, "nclasses should be at least 1");
 
     auto [device_workspace_size, host_workspace_size] = workspaceSize();
-    d_buff.resize(device_workspace_size, handle.get_stream());
+    d_buff.resize(device_workspace_size, builder_stream);
     h_buff.resize(host_workspace_size);
     assignWorkspace(d_buff.data(), h_buff.data());
   }
@@ -299,8 +304,8 @@ struct Builder {
     d_wspace += calculateAlignedBytes(sizeof(WorkloadInfo<IdxT>) * max_blocks);
 
     CUDA_CHECK(
-      cudaMemsetAsync(done_count, 0, sizeof(int) * max_batch * n_col_blks, handle.get_stream()));
-    CUDA_CHECK(cudaMemsetAsync(mutex, 0, sizeof(int) * max_batch, handle.get_stream()));
+      cudaMemsetAsync(done_count, 0, sizeof(int) * max_batch * n_col_blks, builder_stream));
+    CUDA_CHECK(cudaMemsetAsync(mutex, 0, sizeof(int) * max_batch, builder_stream));
 
     // host
     h_workload_info = reinterpret_cast<WorkloadInfo<IdxT>*>(h_wspace);
@@ -314,14 +319,14 @@ struct Builder {
   {
     ML::PUSH_RANGE("Builder::train @builder.cuh [batched-levelalgo]");
     MLCommon::TimerCPU timer;
-    NodeQueue<DataT, LabelT> queue(params, this->maxNodes(), input.nSampledRows);
+    NodeQueue<DataT, LabelT> queue(params, this->maxNodes(), input.nSampledRows, input.numOutputs);
     while (queue.HasWork()) {
       auto work_items                      = queue.Pop();
       auto [splits_host_ptr, splits_count] = doSplit(work_items);
       queue.Push(work_items, splits_host_ptr);
     }
     auto tree = queue.GetTree();
-    this->SetLeafPredictions(&tree->sparsetree, queue.GetInstanceRanges());
+    this->SetLeafPredictions(tree, queue.GetInstanceRanges());
     tree->train_time = timer.getElapsedMilliseconds();
     ML::POP_RANGE();
     return tree;
@@ -347,7 +352,7 @@ struct Builder {
       }
       total_num_blocks += num_blocks;
     }
-    raft::update_device(workload_info, h_workload_info, total_num_blocks, handle.get_stream());
+    raft::update_device(workload_info, h_workload_info, total_num_blocks, builder_stream);
     return std::make_pair(total_num_blocks, n_large_nodes_in_curr_batch);
   }
 
@@ -355,11 +360,11 @@ struct Builder {
   {
     ML::PUSH_RANGE("Builder::doSplit @bulder_base.cuh [batched-levelalgo]");
     // start fresh on the number of *new* nodes created in this batch
-    CUDA_CHECK(cudaMemsetAsync(n_nodes, 0, sizeof(IdxT), handle.get_stream()));
-    initSplit<DataT, IdxT, TPB_DEFAULT>(splits, work_items.size(), handle.get_stream());
+    CUDA_CHECK(cudaMemsetAsync(n_nodes, 0, sizeof(IdxT), builder_stream));
+    initSplit<DataT, IdxT, TPB_DEFAULT>(splits, work_items.size(), builder_stream);
 
     // get the current set of nodes to be worked upon
-    raft::update_device(d_work_items, work_items.data(), work_items.size(), handle.get_stream());
+    raft::update_device(d_work_items, work_items.data(), work_items.size(), builder_stream);
 
     auto [total_blocks, large_blocks] = this->updateWorkloadInfo(work_items);
 
@@ -374,19 +379,18 @@ struct Builder {
     auto smemSize = 2 * sizeof(IdxT) * TPB_DEFAULT;
     ML::PUSH_RANGE("nodeSplitKernel @builder_base.cuh [batched-levelalgo]");
     nodeSplitKernel<DataT, LabelT, IdxT, ObjectiveT, TPB_DEFAULT>
-      <<<work_items.size(), TPB_DEFAULT, smemSize, handle.get_stream()>>>(
-        params.max_depth,
-        params.min_samples_leaf,
-        params.min_samples_split,
-        params.max_leaves,
-        params.min_impurity_decrease,
-        input,
-        d_work_items,
-        splits);
+      <<<work_items.size(), TPB_DEFAULT, smemSize, builder_stream>>>(params.max_depth,
+                                                                     params.min_samples_leaf,
+                                                                     params.min_samples_split,
+                                                                     params.max_leaves,
+                                                                     params.min_impurity_decrease,
+                                                                     input,
+                                                                     d_work_items,
+                                                                     splits);
     CUDA_CHECK(cudaGetLastError());
     ML::POP_RANGE();
-    raft::update_host(h_splits, splits, work_items.size(), handle.get_stream());
-    CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
+    raft::update_host(h_splits, splits, work_items.size(), builder_stream);
+    CUDA_CHECK(cudaStreamSynchronize(builder_stream));
     ML::POP_RANGE();
     return std::make_tuple(h_splits, work_items.size());
   }
@@ -419,54 +423,63 @@ struct Builder {
     auto smemSize = computeSplitSmemSize();
     dim3 grid(total_blocks, colBlks, 1);
     int nHistBins = large_blocks * nbins * colBlks * nclasses;
-    CUDA_CHECK(cudaMemsetAsync(hist, 0, sizeof(BinT) * nHistBins, handle.get_stream()));
+    CUDA_CHECK(cudaMemsetAsync(hist, 0, sizeof(BinT) * nHistBins, builder_stream));
     ML::PUSH_RANGE("computeSplitClassificationKernel @builder_base.cuh [batched-levelalgo]");
-    ObjectiveT objective(input.numOutputs, params.min_impurity_decrease, params.min_samples_leaf);
+    ObjectiveT objective(input.numOutputs, params.min_samples_leaf);
     computeSplitKernel<DataT, LabelT, IdxT, TPB_DEFAULT>
-      <<<grid, TPB_DEFAULT, smemSize, handle.get_stream()>>>(hist,
-                                                             params.n_bins,
-                                                             params.max_depth,
-                                                             params.min_samples_split,
-                                                             params.max_leaves,
-                                                             input,
-                                                             d_work_items,
-                                                             col,
-                                                             done_count,
-                                                             mutex,
-                                                             splits,
-                                                             objective,
-                                                             treeid,
-                                                             workload_info,
-                                                             seed);
+      <<<grid, TPB_DEFAULT, smemSize, builder_stream>>>(hist,
+                                                        params.n_bins,
+                                                        params.max_depth,
+                                                        params.min_samples_split,
+                                                        params.max_leaves,
+                                                        input,
+                                                        d_work_items,
+                                                        col,
+                                                        done_count,
+                                                        mutex,
+                                                        splits,
+                                                        objective,
+                                                        treeid,
+                                                        workload_info,
+                                                        seed);
     ML::POP_RANGE();  // computeSplitKernel
     ML::POP_RANGE();  // Builder::computeSplit
   }
 
   // Set the leaf value predictions in batch
-  void SetLeafPredictions(std::vector<NodeT>* tree,
+  void SetLeafPredictions(std::shared_ptr<DT::TreeMetaDataNode<DataT, LabelT>> tree,
                           const std::vector<InstanceRange>& instance_ranges)
   {
+    tree->vector_leaf.resize(tree->sparsetree.size() * input.numOutputs);
+    ASSERT(tree->sparsetree.size() == instance_ranges.size(),
+           "Expected instance range for each node");
     // do this in batch to reduce peak memory usage in extreme cases
-    std::size_t max_batch_size = min(std::size_t(100000), tree->size());
-    rmm::device_uvector<NodeT> d_tree(max_batch_size, handle.get_stream());
-    rmm::device_uvector<InstanceRange> d_instance_ranges(max_batch_size, handle.get_stream());
-    ObjectiveT objective(input.numOutputs, params.min_impurity_decrease, params.min_samples_leaf);
-    for (std::size_t batch_begin = 0; batch_begin < tree->size(); batch_begin += max_batch_size) {
-      std::size_t batch_end  = min(batch_begin + max_batch_size, tree->size());
+    std::size_t max_batch_size = min(std::size_t(100000), tree->sparsetree.size());
+    rmm::device_uvector<NodeT> d_tree(max_batch_size, builder_stream);
+    rmm::device_uvector<InstanceRange> d_instance_ranges(max_batch_size, builder_stream);
+    rmm::device_uvector<DataT> d_leaves(max_batch_size * input.numOutputs, builder_stream);
+
+    ObjectiveT objective(input.numOutputs, params.min_samples_leaf);
+    for (std::size_t batch_begin = 0; batch_begin < tree->sparsetree.size();
+         batch_begin += max_batch_size) {
+      std::size_t batch_end  = min(batch_begin + max_batch_size, tree->sparsetree.size());
       std::size_t batch_size = batch_end - batch_begin;
       raft::update_device(
-        d_tree.data(), tree->data() + batch_begin, batch_size, handle.get_stream());
-      raft::update_device(d_instance_ranges.data(),
-                          instance_ranges.data() + batch_begin,
-                          batch_size,
-                          handle.get_stream());
+        d_tree.data(), tree->sparsetree.data() + batch_begin, batch_size, builder_stream);
+      raft::update_device(
+        d_instance_ranges.data(), instance_ranges.data() + batch_begin, batch_size, builder_stream);
+
+      CUDA_CHECK(
+        cudaMemsetAsync(d_leaves.data(), 0, sizeof(DataT) * d_leaves.size(), builder_stream));
       size_t smemSize = sizeof(BinT) * input.numOutputs;
       int num_blocks  = batch_size;
-      leafKernel<<<num_blocks, TPB_DEFAULT, smemSize, handle.get_stream()>>>(
-        objective, input, d_tree.data(), d_instance_ranges.data());
-      raft::update_host(tree->data() + batch_begin, d_tree.data(), batch_size, handle.get_stream());
+      leafKernel<<<num_blocks, TPB_DEFAULT, smemSize, builder_stream>>>(
+        objective, input, d_tree.data(), d_instance_ranges.data(), d_leaves.data());
+      raft::update_host(tree->vector_leaf.data() + batch_begin * input.numOutputs,
+                        d_leaves.data(),
+                        batch_size * input.numOutputs,
+                        builder_stream);
     }
-    CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
   }
 };  // end Builder
 
diff --git a/cpp/src/decisiontree/batched-levelalgo/kernels.cuh b/cpp/src/decisiontree/batched-levelalgo/kernels.cuh
index a97a11023e..0010b7d086 100644
--- a/cpp/src/decisiontree/batched-levelalgo/kernels.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/kernels.cuh
@@ -123,7 +123,6 @@ DI void partitionSamples(const Input<DataT, LabelT, IdxT>& input,
     }
   }
 }
-
 template <typename DataT, typename LabelT, typename IdxT, typename ObjectiveT, int TPB>
 __global__ void nodeSplitKernel(IdxT max_depth,
                                 IdxT min_samples_leaf,
@@ -144,17 +143,19 @@ __global__ void nodeSplitKernel(IdxT max_depth,
   partitionSamples<DataT, LabelT, IdxT, TPB>(input, split, work_item, (char*)smem);
 }
 
-template <typename InputT, typename NodeT, typename ObjectiveT>
+template <typename InputT, typename NodeT, typename ObjectiveT, typename DataT>
 __global__ void leafKernel(ObjectiveT objective,
                            InputT input,
-                           NodeT* tree,
-                           const InstanceRange* instance_ranges)
+                           const NodeT* tree,
+                           const InstanceRange* instance_ranges,
+                           DataT* leaves)
 {
   using BinT = typename ObjectiveT::BinT;
   extern __shared__ char shared_memory[];
   auto histogram = reinterpret_cast<BinT*>(shared_memory);
-  auto& node     = tree[blockIdx.x];
-  auto range     = instance_ranges[blockIdx.x];
+  auto node_id   = blockIdx.x;
+  auto& node     = tree[node_id];
+  auto range     = instance_ranges[node_id];
   if (!node.IsLeaf()) return;
   auto tid = threadIdx.x;
   for (int i = tid; i < input.numOutputs; i += blockDim.x) {
@@ -167,8 +168,7 @@ __global__ void leafKernel(ObjectiveT objective,
   }
   __syncthreads();
   if (tid == 0) {
-    node =
-      NodeT::CreateLeafNode(ObjectiveT::LeafPrediction(histogram, input.numOutputs), range.count);
+    ObjectiveT::SetLeafVector(histogram, input.numOutputs, leaves + input.numOutputs * node_id);
   }
 }
 
@@ -338,8 +338,8 @@ __global__ void computeSplitKernel(BinT* hist,
   if (input.nSampledCols == input.N) {
     col = colStart + blockIdx.y;
   } else {
-    int colIndex = colStart + blockIdx.y;
-    col          = select(colIndex, treeid, work_item.idx, seed, input.N);
+    IdxT colIndex = colStart + blockIdx.y;
+    col           = select(colIndex, treeid, work_item.idx, seed, input.N);
   }
 
   // populating shared memory with initial values
diff --git a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh
index 3e3ec252a2..d35ab238ed 100644
--- a/cpp/src/decisiontree/batched-levelalgo/metrics.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/metrics.cuh
@@ -26,21 +26,54 @@
 namespace ML {
 namespace DT {
 
-struct IntBin {
+struct CountBin {
   int x;
+  CountBin(CountBin const&) = default;
+  HDI CountBin(int x_) : x(x_) {}
+  HDI CountBin() : x(0) {}
 
-  DI static void IncrementHistogram(IntBin* hist, int nbins, int b, int label)
+  DI static void IncrementHistogram(CountBin* hist, int nbins, int b, int label)
   {
     auto offset = label * nbins + b;
-    IntBin::AtomicAdd(hist + offset, {1});
+    CountBin::AtomicAdd(hist + offset, {1});
   }
-  DI static void AtomicAdd(IntBin* address, IntBin val) { atomicAdd(&address->x, val.x); }
-  DI IntBin& operator+=(const IntBin& b)
+  DI static void AtomicAdd(CountBin* address, CountBin val) { atomicAdd(&address->x, val.x); }
+  HDI CountBin& operator+=(const CountBin& b)
   {
     x += b.x;
     return *this;
   }
-  DI IntBin operator+(IntBin b) const
+  HDI CountBin operator+(CountBin b) const
+  {
+    b += *this;
+    return b;
+  }
+};
+
+struct AggregateBin {
+  double label_sum;
+  int count;
+
+  AggregateBin(AggregateBin const&) = default;
+  HDI AggregateBin() : label_sum(0.0), count(0) {}
+  HDI AggregateBin(double label_sum, int count) : label_sum(label_sum), count(count) {}
+
+  DI static void IncrementHistogram(AggregateBin* hist, int nbins, int b, double label)
+  {
+    AggregateBin::AtomicAdd(hist + b, {label, 1});
+  }
+  DI static void AtomicAdd(AggregateBin* address, AggregateBin val)
+  {
+    atomicAdd(&address->label_sum, val.label_sum);
+    atomicAdd(&address->count, val.count);
+  }
+  HDI AggregateBin& operator+=(const AggregateBin& b)
+  {
+    label_sum += b.label_sum;
+    count += b.count;
+    return *this;
+  }
+  HDI AggregateBin operator+(AggregateBin b) const
   {
     b += *this;
     return b;
@@ -53,71 +86,78 @@ class GiniObjectiveFunction {
   using DataT  = DataT_;
   using LabelT = LabelT_;
   using IdxT   = IdxT_;
+
+ private:
   IdxT nclasses;
-  DataT min_impurity_decrease;
   IdxT min_samples_leaf;
 
  public:
-  using BinT = IntBin;
-  GiniObjectiveFunction(IdxT nclasses, DataT min_impurity_decrease, IdxT min_samples_leaf)
-    : nclasses(nclasses),
-      min_impurity_decrease(min_impurity_decrease),
-      min_samples_leaf(min_samples_leaf)
+  using BinT = CountBin;
+  GiniObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf)
+    : nclasses(nclasses), min_samples_leaf(min_samples_leaf)
   {
   }
 
   DI IdxT NumClasses() const { return nclasses; }
-  DI Split<DataT, IdxT> Gain(BinT* scdf_labels, DataT* sbins, IdxT col, IdxT len, IdxT nbins)
+
+  /**
+   * @brief compute the gini impurity reduction for each split
+   */
+  HDI DataT GainPerSplit(BinT* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft)
   {
-    Split<DataT, IdxT> sp;
+    IdxT nRight         = len - nLeft;
     constexpr DataT One = DataT(1.0);
-    DataT invlen        = One / len;
+    auto invLen         = One / len;
+    auto invLeft        = One / nLeft;
+    auto invRight       = One / nRight;
+    auto gain           = DataT(0.0);
+
+    // if there aren't enough samples in this split, don't bother!
+    if (nLeft < min_samples_leaf || nRight < min_samples_leaf)
+      return -std::numeric_limits<DataT>::max();
+
+    for (IdxT j = 0; j < nclasses; ++j) {
+      int val_i   = 0;
+      auto lval_i = hist[nbins * j + i].x;
+      auto lval   = DataT(lval_i);
+      gain += lval * invLeft * lval * invLen;
+
+      val_i += lval_i;
+      auto total_sum = hist[nbins * j + nbins - 1].x;
+      auto rval_i    = total_sum - lval_i;
+      auto rval      = DataT(rval_i);
+      gain += rval * invRight * rval * invLen;
+
+      val_i += rval_i;
+      auto val = DataT(val_i) * invLen;
+      gain -= val * val;
+    }
+
+    return gain;
+  }
+
+  DI Split<DataT, IdxT> Gain(BinT* shist, DataT* sbins, IdxT col, IdxT len, IdxT nbins)
+  {
+    Split<DataT, IdxT> sp;
     for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) {
-      int nLeft = 0;
+      IdxT nLeft = 0;
       for (IdxT j = 0; j < nclasses; ++j) {
-        nLeft += scdf_labels[nbins * j + i].x;
+        nLeft += shist[nbins * j + i].x;
       }
-      auto nRight = len - nLeft;
-      auto gain   = DataT(0.0);
-      // if there aren't enough samples in this split, don't bother!
-      if (nLeft < min_samples_leaf || nRight < min_samples_leaf) {
-        gain = -std::numeric_limits<DataT>::max();
-      } else {
-        auto invLeft  = One / nLeft;
-        auto invRight = One / nRight;
-        for (IdxT j = 0; j < nclasses; ++j) {
-          int val_i   = 0;
-          auto lval_i = scdf_labels[nbins * j + i].x;
-          auto lval   = DataT(lval_i);
-          gain += lval * invLeft * lval * invlen;
-
-          val_i += lval_i;
-          auto total_sum = scdf_labels[nbins * j + nbins - 1].x;
-          auto rval_i    = total_sum - lval_i;
-          auto rval      = DataT(rval_i);
-          gain += rval * invRight * rval * invlen;
-
-          val_i += rval_i;
-          auto val = DataT(val_i) * invlen;
-          gain -= val * val;
-        }
-      }
-      sp.update({sbins[i], col, gain, nLeft});
+      sp.update({sbins[i], col, GainPerSplit(shist, i, nbins, len, nLeft), nLeft});
     }
     return sp;
   }
-  static DI LabelT LeafPrediction(BinT* shist, int nclasses)
+  static DI void SetLeafVector(BinT const* shist, int nclasses, DataT* out)
   {
-    int class_idx = 0;
-    int count     = 0;
+    // Output probability
+    int total = 0;
     for (int i = 0; i < nclasses; i++) {
-      auto current_count = shist[i].x;
-      if (current_count > count) {
-        class_idx = i;
-        count     = current_count;
-      }
+      total += shist[i].x;
+    }
+    for (int i = 0; i < nclasses; i++) {
+      out[i] = DataT(shist[i].x) / total;
     }
-    return class_idx;
   }
 };
 
@@ -127,68 +167,82 @@ class EntropyObjectiveFunction {
   using DataT  = DataT_;
   using LabelT = LabelT_;
   using IdxT   = IdxT_;
+
+ private:
   IdxT nclasses;
-  DataT min_impurity_decrease;
   IdxT min_samples_leaf;
 
  public:
-  using BinT = IntBin;
-  EntropyObjectiveFunction(IdxT nclasses, DataT min_impurity_decrease, IdxT min_samples_leaf)
-    : nclasses(nclasses),
-      min_impurity_decrease(min_impurity_decrease),
-      min_samples_leaf(min_samples_leaf)
+  using BinT = CountBin;
+  EntropyObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf)
+    : nclasses(nclasses), min_samples_leaf(min_samples_leaf)
   {
   }
   DI IdxT NumClasses() const { return nclasses; }
+
+  /**
+   * @brief compute the Entropy (or information gain) for each split
+   */
+  HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft)
+  {
+    IdxT nRight{len - nLeft};
+    auto gain{DataT(0.0)};
+    // if there aren't enough samples in this split, don't bother!
+    if (nLeft < min_samples_leaf || nRight < min_samples_leaf) {
+      return -std::numeric_limits<DataT>::max();
+    } else {
+      auto invLeft{DataT(1.0) / nLeft};
+      auto invRight{DataT(1.0) / nRight};
+      auto invLen{DataT(1.0) / len};
+      for (IdxT c = 0; c < nclasses; ++c) {
+        int val_i   = 0;
+        auto lval_i = hist[nbins * c + i].x;
+        if (lval_i != 0) {
+          auto lval = DataT(lval_i);
+          gain += raft::myLog(lval * invLeft) / raft::myLog(DataT(2)) * lval * invLen;
+        }
+
+        val_i += lval_i;
+        auto total_sum = hist[nbins * c + nbins - 1].x;
+        auto rval_i    = total_sum - lval_i;
+        if (rval_i != 0) {
+          auto rval = DataT(rval_i);
+          gain += raft::myLog(rval * invRight) / raft::myLog(DataT(2)) * rval * invLen;
+        }
+
+        val_i += rval_i;
+        if (val_i != 0) {
+          auto val = DataT(val_i) * invLen;
+          gain -= val * raft::myLog(val) / raft::myLog(DataT(2));
+        }
+      }
+
+      return gain;
+    }
+  }
+
   DI Split<DataT, IdxT> Gain(BinT* scdf_labels, DataT* sbins, IdxT col, IdxT len, IdxT nbins)
   {
     Split<DataT, IdxT> sp;
-    constexpr DataT One = DataT(1.0);
-    DataT invlen        = One / len;
     for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) {
-      int nLeft = 0;
+      IdxT nLeft = 0;
       for (IdxT j = 0; j < nclasses; ++j) {
         nLeft += scdf_labels[nbins * j + i].x;
       }
-      auto nRight = len - nLeft;
-      auto gain   = DataT(0.0);
-      // if there aren't enough samples in this split, don't bother!
-      if (nLeft < min_samples_leaf || nRight < min_samples_leaf) {
-        gain = -std::numeric_limits<DataT>::max();
-      } else {
-        auto invLeft  = One / nLeft;
-        auto invRight = One / nRight;
-        for (IdxT j = 0; j < nclasses; ++j) {
-          int val_i   = 0;
-          auto lval_i = scdf_labels[nbins * j + i].x;
-          if (lval_i != 0) {
-            auto lval = DataT(lval_i);
-            gain += raft::myLog(lval * invLeft) / raft::myLog(DataT(2)) * lval * invlen;
-          }
-
-          val_i += lval_i;
-          auto total_sum = scdf_labels[nbins * j + nbins - 1].x;
-          auto rval_i    = total_sum - lval_i;
-          if (rval_i != 0) {
-            auto rval = DataT(rval_i);
-            gain += raft::myLog(rval * invRight) / raft::myLog(DataT(2)) * rval * invlen;
-          }
-
-          val_i += rval_i;
-          if (val_i != 0) {
-            auto val = DataT(val_i) * invlen;
-            gain -= val * raft::myLog(val) / raft::myLog(DataT(2));
-          }
-        }
-      }
-      sp.update({sbins[i], col, gain, nLeft});
+      sp.update({sbins[i], col, GainPerSplit(scdf_labels, i, nbins, len, nLeft), nLeft});
     }
     return sp;
   }
-  static DI LabelT LeafPrediction(BinT* shist, int nclasses)
+  static DI void SetLeafVector(BinT const* shist, int nclasses, DataT* out)
   {
-    // Same as Gini
-    return GiniObjectiveFunction<DataT, LabelT, IdxT>::LeafPrediction(shist, nclasses);
+    // Output probability
+    int total = 0;
+    for (int i = 0; i < nclasses; i++) {
+      total += shist[i].x;
+    }
+    for (int i = 0; i < nclasses; i++) {
+      out[i] = DataT(shist[i].x) / total;
+    }
   }
 };
 
@@ -198,73 +252,307 @@ class MSEObjectiveFunction {
   using DataT  = DataT_;
   using LabelT = LabelT_;
   using IdxT   = IdxT_;
+  using BinT   = AggregateBin;
 
  private:
-  DataT min_impurity_decrease;
   IdxT min_samples_leaf;
 
  public:
-  struct MSEBin {
-    double label_sum;
-    int count;
+  HDI MSEObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf)
+    : min_samples_leaf(min_samples_leaf)
+  {
+  }
 
-    DI static void IncrementHistogram(MSEBin* hist, int nbins, int b, double label)
-    {
-      MSEBin::AtomicAdd(hist + b, {label, 1});
-    }
-    DI static void AtomicAdd(MSEBin* address, MSEBin val)
-    {
-      atomicAdd(&address->label_sum, val.label_sum);
-      atomicAdd(&address->count, val.count);
+  /**
+   * @brief compute the Mean squared error impurity reduction (or purity gain) for each split
+   *
+   * @note This method is used to speed up the search for the best split
+   *       by calculating the gain using a proxy mean squared error reduction.
+   *       It is a proxy quantity such that the split that maximizes this value
+   *       also maximizes the impurity improvement. It neglects all constant terms
+   *       of the impurity decrease for a given split.
+   *       The Gain is the difference in the proxy impurities of the parent and the
+   *       weighted sum of impurities of its children
+   *       and is mathematically equivalent to the respective differences of
+   *       mean-squared errors.
+   */
+  HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) const
+  {
+    auto gain{DataT(0)};
+    IdxT nRight{len - nLeft};
+    auto invLen = DataT(1.0) / len;
+    // if there aren't enough samples in this split, don't bother!
+    if (nLeft < min_samples_leaf || nRight < min_samples_leaf) {
+      return -std::numeric_limits<DataT>::max();
+    } else {
+      auto label_sum        = hist[nbins - 1].label_sum;
+      DataT parent_obj      = -label_sum * label_sum * invLen;
+      DataT left_obj        = -(hist[i].label_sum * hist[i].label_sum) / nLeft;
+      DataT right_label_sum = hist[i].label_sum - label_sum;
+      DataT right_obj       = -(right_label_sum * right_label_sum) / nRight;
+      gain                  = parent_obj - (left_obj + right_obj);
+      gain *= DataT(0.5) * invLen;
+
+      return gain;
     }
-    DI MSEBin& operator+=(const MSEBin& b)
-    {
-      label_sum += b.label_sum;
-      count += b.count;
-      return *this;
+  }
+
+  DI Split<DataT, IdxT> Gain(
+    BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) const
+  {
+    Split<DataT, IdxT> sp;
+    for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) {
+      auto nLeft = shist[i].count;
+      sp.update({sbins[i], col, GainPerSplit(shist, i, nbins, len, nLeft), nLeft});
     }
-    DI MSEBin operator+(MSEBin b) const
-    {
-      b += *this;
-      return b;
+    return sp;
+  }
+
+  DI IdxT NumClasses() const { return 1; }
+
+  static DI void SetLeafVector(BinT const* shist, int nclasses, DataT* out)
+  {
+    for (int i = 0; i < nclasses; i++) {
+      out[i] = shist[i].label_sum / shist[i].count;
     }
-  };
-  using BinT = MSEBin;
-  HDI MSEObjectiveFunction(IdxT nclasses, DataT min_impurity_decrease, IdxT min_samples_leaf)
-    : min_impurity_decrease(min_impurity_decrease), min_samples_leaf(min_samples_leaf)
+  }
+};
+
+template <typename DataT_, typename LabelT_, typename IdxT_>
+class PoissonObjectiveFunction {
+ public:
+  using DataT  = DataT_;
+  using LabelT = LabelT_;
+  using IdxT   = IdxT_;
+  using BinT   = AggregateBin;
+
+ private:
+  IdxT min_samples_leaf;
+
+ public:
+  static constexpr auto eps_ = 10 * std::numeric_limits<DataT>::epsilon();
+
+  HDI PoissonObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf)
+    : min_samples_leaf(min_samples_leaf)
   {
   }
+
+  /**
+   * @brief compute the poisson impurity reduction (or purity gain) for each split
+   *
+   * @note This method is used to speed up the search for the best split
+   *       by calculating the gain using a proxy poisson half deviance reduction.
+   *       It is a proxy quantity such that the split that maximizes this value
+   *       also maximizes the impurity improvement. It neglects all constant terms
+   *       of the impurity decrease for a given split.
+   *       The Gain is the difference in the proxy impurities of the parent and the
+   *       weighted sum of impurities of its children
+   *       and is mathematically equivalent to the respective differences of
+   *       poisson half deviances.
+   */
+  HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) const
+  {
+    // get the lens'
+    IdxT nRight = len - nLeft;
+    auto invLen = DataT(1) / len;
+
+    // if there aren't enough samples in this split, don't bother!
+    if (nLeft < min_samples_leaf || nRight < min_samples_leaf)
+      return -std::numeric_limits<DataT>::max();
+
+    auto label_sum       = hist[nbins - 1].label_sum;
+    auto left_label_sum  = (hist[i].label_sum);
+    auto right_label_sum = (hist[nbins - 1].label_sum - hist[i].label_sum);
+
+    // label sum cannot be non-positive
+    if (label_sum < eps_ || left_label_sum < eps_ || right_label_sum < eps_)
+      return -std::numeric_limits<DataT>::max();
+
+    // compute the gain to be
+    DataT parent_obj = -label_sum * raft::myLog(label_sum * invLen);
+    DataT left_obj   = -left_label_sum * raft::myLog(left_label_sum / nLeft);
+    DataT right_obj  = -right_label_sum * raft::myLog(right_label_sum / nRight);
+    DataT gain       = parent_obj - (left_obj + right_obj);
+    gain             = gain * invLen;
+
+    return gain;
+  }
+
+  DI Split<DataT, IdxT> Gain(
+    BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) const
+  {
+    Split<DataT, IdxT> sp;
+    for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) {
+      auto nLeft = shist[i].count;
+      sp.update({sbins[i], col, GainPerSplit(shist, i, nbins, len, nLeft), nLeft});
+    }
+    return sp;
+  }
+
   DI IdxT NumClasses() const { return 1; }
-  DI Split<DataT, IdxT> Gain(BinT* shist, DataT* sbins, IdxT col, IdxT len, IdxT nbins)
+
+  static DI void SetLeafVector(BinT const* shist, int nclasses, DataT* out)
+  {
+    for (int i = 0; i < nclasses; i++) {
+      out[i] = shist[i].label_sum / shist[i].count;
+    }
+  }
+};
+
+template <typename DataT_, typename LabelT_, typename IdxT_>
+class GammaObjectiveFunction {
+ public:
+  using DataT                = DataT_;
+  using LabelT               = LabelT_;
+  using IdxT                 = IdxT_;
+  using BinT                 = AggregateBin;
+  static constexpr auto eps_ = 10 * std::numeric_limits<DataT>::epsilon();
+
+ private:
+  IdxT min_samples_leaf;
+
+ public:
+  HDI GammaObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf)
+    : min_samples_leaf{min_samples_leaf}
+  {
+  }
+
+  /**
+   * @brief compute the gamma impurity reduction (or purity gain) for each split
+   *
+   * @note This method is used to speed up the search for the best split
+   *       by calculating the gain using a proxy gamma half deviance reduction.
+   *       It is a proxy quantity such that the split that maximizes this value
+   *       also maximizes the impurity improvement. It neglects all constant terms
+   *       of the impurity decrease for a given split.
+   *       The Gain is the difference in the proxy impurities of the parent and the
+   *       weighted sum of impurities of its children
+   *       and is mathematically equivalent to the respective differences of
+   *       gamma half deviances.
+   */
+  HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) const
+  {
+    IdxT nRight = len - nLeft;
+    auto invLen = DataT(1) / len;
+
+    // if there aren't enough samples in this split, don't bother!
+    if (nLeft < min_samples_leaf || nRight < min_samples_leaf)
+      return -std::numeric_limits<DataT>::max();
+
+    DataT label_sum       = hist[nbins - 1].label_sum;
+    DataT left_label_sum  = (hist[i].label_sum);
+    DataT right_label_sum = (hist[nbins - 1].label_sum - hist[i].label_sum);
+
+    // label sum cannot be non-positive
+    if (label_sum < eps_ || left_label_sum < eps_ || right_label_sum < eps_)
+      return -std::numeric_limits<DataT>::max();
+
+    // compute the gain to be
+    DataT parent_obj = len * raft::myLog(label_sum * invLen);
+    DataT left_obj   = nLeft * raft::myLog(left_label_sum / nLeft);
+    DataT right_obj  = nRight * raft::myLog(right_label_sum / nRight);
+    DataT gain       = parent_obj - (left_obj + right_obj);
+    gain             = gain * invLen;
+
+    return gain;
+  }
+
+  DI Split<DataT, IdxT> Gain(
+    BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) const
   {
     Split<DataT, IdxT> sp;
-    auto invlen = DataT(1.0) / len;
     for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) {
-      auto nLeft  = shist[i].count;
-      auto nRight = len - nLeft;
-      DataT gain;
-      // if there aren't enough samples in this split, don't bother!
-      if (nLeft < min_samples_leaf || nRight < min_samples_leaf) {
-        gain = -std::numeric_limits<DataT>::max();
-      } else {
-        auto label_sum        = shist[nbins - 1].label_sum;
-        DataT parent_obj      = -label_sum * label_sum / len;
-        DataT left_obj        = -(shist[i].label_sum * shist[i].label_sum) / nLeft;
-        DataT right_label_sum = shist[i].label_sum - label_sum;
-        DataT right_obj       = -(right_label_sum * right_label_sum) / nRight;
-        gain                  = parent_obj - (left_obj + right_obj);
-        gain *= invlen;
-      }
-      sp.update({sbins[i], col, gain, nLeft});
+      auto nLeft = shist[i].count;
+      sp.update({sbins[i], col, GainPerSplit(shist, i, nbins, len, nLeft), nLeft});
     }
     return sp;
   }
+  DI IdxT NumClasses() const { return 1; }
 
-  static DI LabelT LeafPrediction(BinT* shist, int nclasses)
+  static DI void SetLeafVector(BinT const* shist, int nclasses, DataT* out)
   {
-    return shist[0].label_sum / shist[0].count;
+    for (int i = 0; i < nclasses; i++) {
+      out[i] = shist[i].label_sum / shist[i].count;
+    }
   }
 };
 
-}  // namespace DT
-}  // namespace ML
+template <typename DataT_, typename LabelT_, typename IdxT_>
+class InverseGaussianObjectiveFunction {
+ public:
+  using DataT                = DataT_;
+  using LabelT               = LabelT_;
+  using IdxT                 = IdxT_;
+  using BinT                 = AggregateBin;
+  static constexpr auto eps_ = 10 * std::numeric_limits<DataT>::epsilon();
+
+ private:
+  IdxT min_samples_leaf;
+
+ public:
+  HDI InverseGaussianObjectiveFunction(IdxT nclasses, IdxT min_samples_leaf)
+    : min_samples_leaf{min_samples_leaf}
+  {
+  }
+
+  /**
+   * @brief compute the inverse gaussian impurity reduction (or purity gain) for each split
+   *
+   * @note This method is used to speed up the search for the best split
+   *       by calculating the gain using a proxy inverse gaussian half deviance reduction.
+   *       It is a proxy quantity such that the split that maximizes this value
+   *       also maximizes the impurity improvement. It neglects all constant terms
+   *       of the impurity decrease for a given split.
+   *       The Gain is the difference in the proxy impurities of the parent and the
+   *       weighted sum of impurities of its children
+   *       and is mathematically equivalent to the respective differences of
+   *       inverse gaussian deviances.
+   */
+  HDI DataT GainPerSplit(BinT const* hist, IdxT i, IdxT nbins, IdxT len, IdxT nLeft) const
+  {
+    // get the lens'
+    IdxT nRight = len - nLeft;
+
+    // if there aren't enough samples in this split, don't bother!
+    if (nLeft < min_samples_leaf || nRight < min_samples_leaf)
+      return -std::numeric_limits<DataT>::max();
+
+    auto label_sum       = hist[nbins - 1].label_sum;
+    auto left_label_sum  = (hist[i].label_sum);
+    auto right_label_sum = (hist[nbins - 1].label_sum - hist[i].label_sum);
+
+    // label sum cannot be non-positive
+    if (label_sum < eps_ || left_label_sum < eps_ || right_label_sum < eps_)
+      return -std::numeric_limits<DataT>::max();
+
+    // compute the gain to be
+    DataT parent_obj = -DataT(len) * DataT(len) / label_sum;
+    DataT left_obj   = -DataT(nLeft) * DataT(nLeft) / left_label_sum;
+    DataT right_obj  = -DataT(nRight) * DataT(nRight) / right_label_sum;
+    DataT gain       = parent_obj - (left_obj + right_obj);
+    gain             = gain / (2 * len);
+
+    return gain;
+  }
+
+  DI Split<DataT, IdxT> Gain(
+    BinT const* shist, DataT const* sbins, IdxT col, IdxT len, IdxT nbins) const
+  {
+    Split<DataT, IdxT> sp;
+    for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) {
+      auto nLeft = shist[i].count;
+      sp.update({sbins[i], col, GainPerSplit(shist, i, nbins, len, nLeft), nLeft});
+    }
+    return sp;
+  }
+  DI IdxT NumClasses() const { return 1; }
+
+  static DI void SetLeafVector(BinT const* shist, int nclasses, DataT* out)
+  {
+    for (int i = 0; i < nclasses; i++) {
+      out[i] = shist[i].label_sum / shist[i].count;
+    }
+  }
+};
+}  // end namespace DT
+}  // end namespace ML
\ No newline at end of file
diff --git a/cpp/src/decisiontree/decisiontree.cu b/cpp/src/decisiontree/decisiontree.cu
index 14272823bc..d1c3463e41 100644
--- a/cpp/src/decisiontree/decisiontree.cu
+++ b/cpp/src/decisiontree/decisiontree.cu
@@ -95,14 +95,14 @@ template <class T, class L>
 std::string get_tree_text(const TreeMetaDataNode<T, L>* tree)
 {
   std::string summary = get_tree_summary_text<T, L>(tree);
-  return summary + "\n" + get_node_text<T, L>("", tree->sparsetree, 0, false);
+  return summary + "\n" + get_node_text<T, L>("", tree, 0, false);
 }
 
 template <class T, class L>
 std::string get_tree_json(const TreeMetaDataNode<T, L>* tree)
 {
   std::ostringstream oss;
-  return get_node_json("", tree->sparsetree, 0);
+  return get_node_json("", tree, 0);
 }
 
 // Functions' specializations
diff --git a/cpp/src/decisiontree/decisiontree.cuh b/cpp/src/decisiontree/decisiontree.cuh
index c079cd6a68..c06d587539 100644
--- a/cpp/src/decisiontree/decisiontree.cuh
+++ b/cpp/src/decisiontree/decisiontree.cuh
@@ -73,56 +73,65 @@ inline bool is_dev_ptr(const void* p)
   }
 }
 
+template <typename T>
+std::string to_string_high_precision(T x)
+{
+  static_assert(std::is_floating_point<T>::value || std::is_integral<T>::value,
+                "T must be float, double, or integer");
+  std::ostringstream oss;
+  oss.imbue(std::locale::classic());  // use C locale
+  if (std::is_floating_point<T>::value) {
+    oss << std::setprecision(std::numeric_limits<T>::max_digits10) << x;
+  } else {
+    oss << x;
+  }
+  return oss.str();
+}
+
 template <class T, class L>
 std::string get_node_text(const std::string& prefix,
-                          const std::vector<SparseTreeNode<T, L>>& sparsetree,
+                          const TreeMetaDataNode<T, L>* tree,
                           int idx,
                           bool isLeft)
 {
-  const SparseTreeNode<T, L>& node = sparsetree[idx];
+  const SparseTreeNode<T, L>& node = tree->sparsetree[idx];
 
   std::ostringstream oss;
 
   // print the value of the node
-  std::stringstream ss;
-  ss << prefix.c_str();
-  ss << (isLeft ? "├" : "└");
-  ss << node;
+  oss << prefix.c_str();
+  oss << (isLeft ? "├" : "└");
 
-  oss << ss.str();
+  if (node.IsLeaf()) {
+    oss << "(leaf, "
+        << "prediction: [";
+
+    for (int k = 0; k < tree->num_outputs - 1; k++) {
+      oss << tree->vector_leaf[idx * tree->num_outputs + k] << ", ";
+    }
+    oss << tree->vector_leaf[idx * tree->num_outputs + tree->num_outputs - 1];
+
+    oss << "], best_metric_val: " << node.BestMetric() << ")";
+  } else {
+    oss << "("
+        << "colid: " << node.ColumnId() << ", quesval: " << node.QueryValue()
+        << ", best_metric_val: " << node.BestMetric() << ")";
+  }
 
   if (!node.IsLeaf()) {
     // enter the next tree level - left and right branch
     oss << "\n"
-        << get_node_text(prefix + (isLeft ? "│   " : "    "), sparsetree, node.LeftChildId(), true)
+        << get_node_text(prefix + (isLeft ? "│   " : "    "), tree, node.LeftChildId(), true)
         << "\n"
-        << get_node_text(
-             prefix + (isLeft ? "│   " : "    "), sparsetree, node.RightChildId(), false);
-  }
-  return oss.str();
-}
-
-template <typename T>
-std::string to_string_high_precision(T x)
-{
-  static_assert(std::is_floating_point<T>::value || std::is_integral<T>::value,
-                "T must be float, double, or integer");
-  std::ostringstream oss;
-  oss.imbue(std::locale::classic());  // use C locale
-  if (std::is_floating_point<T>::value) {
-    oss << std::setprecision(std::numeric_limits<T>::max_digits10) << x;
-  } else {
-    oss << x;
+        << get_node_text(prefix + (isLeft ? "│   " : "    "), tree, node.RightChildId(), false);
   }
   return oss.str();
 }
 
 template <class T, class L>
-std::string get_node_json(const std::string& prefix,
-                          const std::vector<SparseTreeNode<T, L>>& sparsetree,
-                          int idx)
+std::string get_node_json(const std::string& prefix, const TreeMetaDataNode<T, L>* tree, int idx)
 {
-  const SparseTreeNode<T, L>& node = sparsetree[idx];
+  const SparseTreeNode<T, L>& node = tree->sparsetree[idx];
 
   std::ostringstream oss;
   if (!node.IsLeaf()) {
@@ -133,32 +142,22 @@ std::string get_node_json(const std::string& prefix,
     oss << ", \"yes\": " << node.LeftChildId() << ", \"no\": " << (node.RightChildId())
         << ", \"children\": [\n";
     // enter the next tree level - left and right branch
-    oss << get_node_json(prefix + "  ", sparsetree, node.LeftChildId()) << ",\n"
-        << get_node_json(prefix + "  ", sparsetree, node.RightChildId()) << "\n"
+    oss << get_node_json(prefix + "  ", tree, node.LeftChildId()) << ",\n"
+        << get_node_json(prefix + "  ", tree, node.RightChildId()) << "\n"
         << prefix << "]}";
   } else {
-    oss << prefix << "{\"nodeid\": " << idx
-        << ", \"leaf_value\": " << to_string_high_precision(node.Prediction());
-    oss << ", \"instance_count\": " << node.InstanceCount();
+    oss << prefix << "{\"nodeid\": " << idx << ", \"leaf_value\": [";
+    for (int k = 0; k < tree->num_outputs - 1; k++) {
+      oss << to_string_high_precision(tree->vector_leaf[idx * tree->num_outputs + k]) << ", ";
+    }
+    oss << to_string_high_precision(
+      tree->vector_leaf[idx * tree->num_outputs + tree->num_outputs - 1]);
+    oss << "], \"instance_count\": " << node.InstanceCount();
     oss << "}";
   }
   return oss.str();
 }
 
-template <typename T, typename L>
-std::ostream& operator<<(std::ostream& os, const SparseTreeNode<T, L>& node)
-{
-  if (node.IsLeaf()) {
-    os << "(leaf, "
-       << "prediction: " << node.Prediction() << ", best_metric_val: " << node.BestMetric() << ")";
-  } else {
-    os << "("
-       << "colid: " << node.ColumnId() << ", quesval: " << node.QueryValue()
-       << ", best_metric_val: " << node.BestMetric() << ")";
-  }
-  return os;
-}
-
 template <class T, class L>
 tl::Tree<T, T> build_treelite_tree(const DT::TreeMetaDataNode<T, L>& rf_tree,
                                    unsigned int num_class)
@@ -188,7 +187,8 @@ tl::Tree<T, T> build_treelite_tree(const DT::TreeMetaDataNode<T, L>& rf_tree,
     next_level_queue.resize(std::max(2 * cur_level_size, next_level_queue.size()));
 
     for (size_t i = 0; i < cur_level_size; ++i) {
-      const SparseTreeNode<T, L>& q_node = rf_tree.sparsetree[cur_level_queue[cur_front].first];
+      auto cuml_node_id                  = cur_level_queue[cur_front].first;
+      const SparseTreeNode<T, L>& q_node = rf_tree.sparsetree[cuml_node_id];
       auto tl_node_id                    = cur_level_queue[cur_front].second;
       ++cur_front;
 
@@ -208,11 +208,11 @@ tl::Tree<T, T> build_treelite_tree(const DT::TreeMetaDataNode<T, L>& rf_tree,
           tl_node_id, q_node.ColumnId(), q_node.QueryValue(), true, tl::Operator::kLE);
 
       } else {
+        auto leaf_begin = rf_tree.vector_leaf.begin() + cuml_node_id * num_class;
         if (num_class == 1) {
-          tl_tree.SetLeaf(tl_node_id, static_cast<T>(q_node.Prediction()));
+          tl_tree.SetLeaf(tl_node_id, *leaf_begin);
         } else {
-          std::vector<T> leaf_vector(num_class, 0);
-          leaf_vector[q_node.Prediction()] = 1;
+          std::vector<T> leaf_vector(leaf_begin, leaf_begin + num_class);
           tl_tree.SetLeafVector(tl_node_id, leaf_vector);
         }
       }
@@ -232,6 +232,7 @@ class DecisionTree {
   template <class DataT, class LabelT>
   static std::shared_ptr<DT::TreeMetaDataNode<DataT, LabelT>> fit(
     const raft::handle_t& handle,
+    const cudaStream_t s,
     const DataT* data,
     const int ncols,
     const int nrows,
@@ -253,6 +254,7 @@ class DecisionTree {
     // Dispatch objective
     if (params.split_criterion == CRITERION::GINI) {
       return Builder<GiniObjectiveFunction<DataT, LabelT, IdxT>>(handle,
+                                                                 s,
                                                                  treeid,
                                                                  seed,
                                                                  params,
@@ -266,6 +268,7 @@ class DecisionTree {
         .train();
     } else if (params.split_criterion == CRITERION::ENTROPY) {
       return Builder<EntropyObjectiveFunction<DataT, LabelT, IdxT>>(handle,
+                                                                    s,
                                                                     treeid,
                                                                     seed,
                                                                     params,
@@ -279,6 +282,7 @@ class DecisionTree {
         .train();
     } else if (params.split_criterion == CRITERION::MSE) {
       return Builder<MSEObjectiveFunction<DataT, LabelT, IdxT>>(handle,
+                                                                s,
                                                                 treeid,
                                                                 seed,
                                                                 params,
@@ -290,6 +294,48 @@ class DecisionTree {
                                                                 unique_labels,
                                                                 quantiles)
         .train();
+    } else if (params.split_criterion == CRITERION::POISSON) {
+      return Builder<PoissonObjectiveFunction<DataT, LabelT, IdxT>>(handle,
+                                                                    s,
+                                                                    treeid,
+                                                                    seed,
+                                                                    params,
+                                                                    data,
+                                                                    labels,
+                                                                    nrows,
+                                                                    ncols,
+                                                                    rowids,
+                                                                    unique_labels,
+                                                                    quantiles)
+        .train();
+    } else if (params.split_criterion == CRITERION::GAMMA) {
+      return Builder<GammaObjectiveFunction<DataT, LabelT, IdxT>>(handle,
+                                                                  s,
+                                                                  treeid,
+                                                                  seed,
+                                                                  params,
+                                                                  data,
+                                                                  labels,
+                                                                  nrows,
+                                                                  ncols,
+                                                                  rowids,
+                                                                  unique_labels,
+                                                                  quantiles)
+        .train();
+    } else if (params.split_criterion == CRITERION::INVERSE_GAUSSIAN) {
+      return Builder<InverseGaussianObjectiveFunction<DataT, LabelT, IdxT>>(handle,
+                                                                            s,
+                                                                            treeid,
+                                                                            seed,
+                                                                            params,
+                                                                            data,
+                                                                            labels,
+                                                                            nrows,
+                                                                            ncols,
+                                                                            rowids,
+                                                                            unique_labels,
+                                                                            quantiles)
+        .train();
     } else {
       ASSERT(false, "Unknown split criterion.");
     }
@@ -297,11 +343,12 @@ class DecisionTree {
 
   template <class DataT, class LabelT>
   static void predict(const raft::handle_t& handle,
-                      const DT::TreeMetaDataNode<DataT, LabelT>* tree,
+                      const DT::TreeMetaDataNode<DataT, LabelT>& tree,
                       const DataT* rows,
-                      const int n_rows,
-                      const int n_cols,
-                      LabelT* predictions,
+                      std::size_t n_rows,
+                      std::size_t n_cols,
+                      DataT* predictions,
+                      int num_outputs,
                       int verbosity)
   {
     if (verbosity >= 0) { ML::Logger::get().setLevel(verbosity); }
@@ -309,44 +356,44 @@ class DecisionTree {
            "DT Error: Current impl. expects both input and predictions to be CPU "
            "pointers.\n");
 
-    ASSERT(tree && (tree->sparsetree.size() != 0),
+    ASSERT(tree.sparsetree.size() != 0,
            "Cannot predict w/ empty tree, tree size %zu",
-           tree->sparsetree.size());
-    ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows);
-    ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols);
+           tree.sparsetree.size());
 
-    predict_all(tree, rows, n_rows, n_cols, predictions);
+    predict_all(tree, rows, n_rows, n_cols, predictions, num_outputs);
   }
 
   template <class DataT, class LabelT>
-  static void predict_all(const DT::TreeMetaDataNode<DataT, LabelT>* tree,
+  static void predict_all(const DT::TreeMetaDataNode<DataT, LabelT>& tree,
                           const DataT* rows,
-                          const int n_rows,
-                          const int n_cols,
-                          LabelT* preds)
+                          std::size_t n_rows,
+                          std::size_t n_cols,
+                          DataT* preds,
+                          int num_outputs)
   {
-    for (int row_id = 0; row_id < n_rows; row_id++) {
-      preds[row_id] = predict_one(&rows[row_id * n_cols], tree->sparsetree, 0);
+    for (std::size_t row_id = 0; row_id < n_rows; row_id++) {
+      predict_one(&rows[row_id * n_cols], tree, preds + row_id * num_outputs, num_outputs);
     }
   }
 
   template <class DataT, class LabelT>
-  static LabelT predict_one(const DataT* row,
-                            const std::vector<SparseTreeNode<DataT, LabelT>>& sparsetree,
-                            int idx)
+  static void predict_one(const DataT* row,
+                          const DT::TreeMetaDataNode<DataT, LabelT>& tree,
+                          DataT* preds_out,
+                          int num_outputs)
   {
-    auto colid     = sparsetree[idx].ColumnId();
-    DataT quesval  = sparsetree[idx].QueryValue();
-    auto leftchild = sparsetree[idx].LeftChildId();
-    if (sparsetree[idx].IsLeaf()) {
-      CUML_LOG_DEBUG("Leaf node. Predicting %f", (float)sparsetree[idx].Prediction());
-      return sparsetree[idx].Prediction();
-    } else if (row[colid] <= quesval) {
-      CUML_LOG_DEBUG("Classifying Left @ node w/ column %d and value %f", colid, (float)quesval);
-      return predict_one(row, sparsetree, leftchild);
-    } else {
-      CUML_LOG_DEBUG("Classifying Right @ node w/ column %d and value %f", colid, (float)quesval);
-      return predict_one(row, sparsetree, leftchild + 1);
+    std::size_t idx = 0;
+    auto n          = tree.sparsetree[idx];
+    while (!n.IsLeaf()) {
+      if (row[n.ColumnId()] <= n.QueryValue()) {
+        idx = n.LeftChildId();
+      } else {
+        idx = n.RightChildId();
+      }
+      n = tree.sparsetree[idx];
+    }
+    for (int i = 0; i < num_outputs; i++) {
+      preds_out[i] += tree.vector_leaf[idx * num_outputs + i];
     }
   }
 
diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh
index 58dd3dcdce..7de2eb8efd 100644
--- a/cpp/src/fil/common.cuh
+++ b/cpp/src/fil/common.cuh
@@ -38,21 +38,17 @@ __host__ __device__ __forceinline__ int forest_num_nodes(int num_trees, int dept
   return num_trees * tree_num_nodes(depth);
 }
 
-template <>
-__host__ __device__ __forceinline__ float base_node::output<float>() const
-{
-  return val.f;
-}
-template <>
-__host__ __device__ __forceinline__ int base_node::output<int>() const
-{
-  return val.idx;
-}
+struct storage_base {
+  categorical_sets sets_;
+  float* vector_leaf_;
+  bool cats_present() const { return sets_.cats_present(); }
+};
 
 /** dense_tree represents a dense tree */
-struct dense_tree {
-  __host__ __device__ dense_tree(dense_node* nodes, int node_pitch)
-    : nodes_(nodes), node_pitch_(node_pitch)
+
+struct dense_tree : tree_base {
+  __host__ __device__ dense_tree(categorical_sets cat_sets, dense_node* nodes, int node_pitch)
+    : tree_base{cat_sets}, nodes_(nodes), node_pitch_(node_pitch)
   {
   }
   __host__ __device__ const dense_node& operator[](int i) const { return nodes_[i * node_pitch_]; }
@@ -61,51 +57,57 @@ struct dense_tree {
 };
 
 /** dense_storage stores the forest as a collection of dense nodes */
-struct dense_storage {
-  __host__ __device__ dense_storage(
-    dense_node* nodes, int num_trees, int tree_stride, int node_pitch, float* vector_leaf)
-    : nodes_(nodes),
+struct dense_storage : storage_base {
+  __host__ __device__ dense_storage(categorical_sets cat_sets,
+                                    float* vector_leaf,
+                                    dense_node* nodes,
+                                    int num_trees,
+                                    int tree_stride,
+                                    int node_pitch)
+    : storage_base{cat_sets, vector_leaf},
+      nodes_(nodes),
       num_trees_(num_trees),
       tree_stride_(tree_stride),
-      node_pitch_(node_pitch),
-      vector_leaf_(vector_leaf)
+      node_pitch_(node_pitch)
   {
   }
   __host__ __device__ int num_trees() const { return num_trees_; }
   __host__ __device__ dense_tree operator[](int i) const
   {
-    return dense_tree(nodes_ + i * tree_stride_, node_pitch_);
+    return dense_tree(sets_, nodes_ + i * tree_stride_, node_pitch_);
   }
-  dense_node* nodes_  = nullptr;
-  float* vector_leaf_ = nullptr;
-  int num_trees_      = 0;
-  int tree_stride_    = 0;
-  int node_pitch_     = 0;
+  dense_node* nodes_ = nullptr;
+  int num_trees_     = 0;
+  int tree_stride_   = 0;
+  int node_pitch_    = 0;
 };
 
 /** sparse_tree is a sparse tree */
 template <typename node_t>
-struct sparse_tree {
-  __host__ __device__ sparse_tree(node_t* nodes) : nodes_(nodes) {}
+struct sparse_tree : tree_base {
+  __host__ __device__ sparse_tree(categorical_sets cat_sets, node_t* nodes)
+    : tree_base{cat_sets}, nodes_(nodes)
+  {
+  }
   __host__ __device__ const node_t& operator[](int i) const { return nodes_[i]; }
   node_t* nodes_ = nullptr;
 };
 
 /** sparse_storage stores the forest as a collection of sparse nodes */
 template <typename node_t>
-struct sparse_storage {
-  int* trees_         = nullptr;
-  node_t* nodes_      = nullptr;
-  float* vector_leaf_ = nullptr;
-  int num_trees_      = 0;
-  __host__ __device__ sparse_storage(int* trees, node_t* nodes, int num_trees, float* vector_leaf)
-    : trees_(trees), nodes_(nodes), num_trees_(num_trees), vector_leaf_(vector_leaf)
+struct sparse_storage : storage_base {
+  int* trees_    = nullptr;
+  node_t* nodes_ = nullptr;
+  int num_trees_ = 0;
+  __host__ __device__ sparse_storage(
+    categorical_sets cat_sets, float* vector_leaf, int* trees, node_t* nodes, int num_trees)
+    : storage_base{cat_sets, vector_leaf}, trees_(trees), nodes_(nodes), num_trees_(num_trees)
   {
   }
   __host__ __device__ int num_trees() const { return num_trees_; }
   __host__ __device__ sparse_tree<node_t> operator[](int i) const
   {
-    return sparse_tree<node_t>(&nodes_[trees_[i]]);
+    return sparse_tree<node_t>(sets_, &nodes_[trees_[i]]);
   }
 };
 
@@ -128,12 +130,17 @@ struct shmem_size_params {
   /// are the input columns are prefetched into shared
   /// memory before inferring the row in question
   bool cols_in_shmem = true;
+  // are there categorical inner nodes? doesn't currently affect shared memory size,
+  // but participates in template dispatch and may affect it later
+  bool cats_present = false;
   /// log2_threads_per_tree determines how many threads work on a single tree
   /// at once inside a block (sharing trees means splitting input rows)
   int log2_threads_per_tree = 0;
   /// n_items is how many input samples (items) any thread processes. If 0 is given,
-  /// choose the reasonable most (<=4) that fit into shared memory. See init_n_items()
+  /// choose the reasonable most (<= MAX_N_ITEMS) that fit into shared memory. See init_n_items()
   int n_items = 0;
+  // block_dim_x is the CUDA block size. Set by dispatch_on_leaf_algo(...)
+  int block_dim_x = 0;
   /// shm_sz is the associated shared memory footprint
   int shm_sz = INT_MAX;
 
@@ -145,9 +152,6 @@ struct shmem_size_params {
   {
     return cols_in_shmem ? sizeof(float) * sdata_stride() * n_items << log2_threads_per_tree : 0;
   }
-  void compute_smem_footprint();
-  template <int NITEMS>
-  size_t get_smem_footprint();
   template <int NITEMS, leaf_algo_t leaf_algo>
   size_t get_smem_footprint();
 };
@@ -169,9 +173,118 @@ struct predict_params : shmem_size_params {
   // to signal infer kernel to apply softmax and also average prior to that
   // for GROVE_PER_CLASS for predict_proba
   output_t transform;
+  // number of blocks to launch
   int num_blocks;
 };
 
+constexpr leaf_algo_t next_leaf_algo(leaf_algo_t algo)
+{
+  return static_cast<leaf_algo_t>(algo + 1);
+}
+
+template <bool COLS_IN_SHMEM_    = false,
+          bool CATS_SUPPORTED_   = false,
+          leaf_algo_t LEAF_ALGO_ = MIN_LEAF_ALGO,
+          int N_ITEMS_           = 1>
+struct KernelTemplateParams {
+  static const bool COLS_IN_SHMEM    = COLS_IN_SHMEM_;
+  static const bool CATS_SUPPORTED   = CATS_SUPPORTED_;
+  static const leaf_algo_t LEAF_ALGO = LEAF_ALGO_;
+  static const int N_ITEMS           = N_ITEMS_;
+
+  template <bool _cats_supported>
+  using ReplaceCatsSupported =
+    KernelTemplateParams<COLS_IN_SHMEM, _cats_supported, LEAF_ALGO, N_ITEMS>;
+  using NextLeafAlgo =
+    KernelTemplateParams<COLS_IN_SHMEM, CATS_SUPPORTED, next_leaf_algo(LEAF_ALGO), N_ITEMS>;
+  template <leaf_algo_t NEW_LEAF_ALGO>
+  using ReplaceLeafAlgo =
+    KernelTemplateParams<COLS_IN_SHMEM, CATS_SUPPORTED, NEW_LEAF_ALGO, N_ITEMS>;
+  using IncNItems = KernelTemplateParams<COLS_IN_SHMEM, CATS_SUPPORTED, LEAF_ALGO, N_ITEMS + 1>;
+};
+
+// inherit from this struct to pass the functor to dispatch_on_fil_template_params()
+// compiler will prevent defining a .run() method with a different output type
+template <typename T>
+struct dispatch_functor {
+  typedef T return_t;
+  template <class KernelParams = KernelTemplateParams<>>
+  T run(predict_params);
+};
+
+namespace dispatch {
+
+template <class KernelParams, class Func, class T = typename Func::return_t>
+T dispatch_on_n_items(Func func, predict_params params)
+{
+  if (params.n_items == KernelParams::N_ITEMS) {
+    return func.template run<KernelParams>(params);
+  } else if constexpr (KernelParams::N_ITEMS < MAX_N_ITEMS) {
+    return dispatch_on_n_items<class KernelParams::IncNItems>(func, params);
+  } else {
+    ASSERT(false, "n_items > %d or < 1", MAX_N_ITEMS);
+  }
+  return T();  // appeasing the compiler
+}
+
+template <class KernelParams, class Func, class T = typename Func::return_t>
+T dispatch_on_leaf_algo(Func func, predict_params params)
+{
+  if (params.leaf_algo == KernelParams::LEAF_ALGO) {
+    if constexpr (KernelParams::LEAF_ALGO == GROVE_PER_CLASS) {
+      if (params.num_classes <= FIL_TPB) {
+        params.block_dim_x = FIL_TPB - FIL_TPB % params.num_classes;
+        using Next         = typename KernelParams::ReplaceLeafAlgo<GROVE_PER_CLASS_FEW_CLASSES>;
+        return dispatch_on_n_items<Next>(func, params);
+      } else {
+        params.block_dim_x = FIL_TPB;
+        using Next         = typename KernelParams::ReplaceLeafAlgo<GROVE_PER_CLASS_MANY_CLASSES>;
+        return dispatch_on_n_items<Next>(func, params);
+      }
+    } else {
+      params.block_dim_x = FIL_TPB;
+      return dispatch_on_n_items<KernelParams>(func, params);
+    }
+  } else if constexpr (next_leaf_algo(KernelParams::LEAF_ALGO) <= MAX_LEAF_ALGO) {
+    return dispatch_on_leaf_algo<class KernelParams::NextLeafAlgo>(func, params);
+  } else {
+    ASSERT(false, "internal error: dispatch: invalid leaf_algo %d", params.leaf_algo);
+  }
+  return T();  // appeasing the compiler
+}
+
+template <class KernelParams, class Func, class T = typename Func::return_t>
+T dispatch_on_cats_supported(Func func, predict_params params)
+{
+  return params.cats_present
+           ? dispatch_on_leaf_algo<typename KernelParams::ReplaceCatsSupported<true>>(func, params)
+           : dispatch_on_leaf_algo<typename KernelParams::ReplaceCatsSupported<false>>(func,
+                                                                                       params);
+}
+
+template <class Func, class T = typename Func::return_t>
+T dispatch_on_cols_in_shmem(Func func, predict_params params)
+{
+  return params.cols_in_shmem
+           ? dispatch_on_cats_supported<KernelTemplateParams<true>>(func, params)
+           : dispatch_on_cats_supported<KernelTemplateParams<false>>(func, params);
+}
+
+}  // namespace dispatch
+
+template <class Func, class T = typename Func::return_t>
+T dispatch_on_fil_template_params(Func func, predict_params params)
+{
+  return dispatch::dispatch_on_cols_in_shmem(func, params);
+}
+
+// For an example of Func declaration, see this.
+// the .run(predict_params) method will be defined in infer.cu
+struct compute_smem_footprint : dispatch_functor<int> {
+  template <class KernelParams = KernelTemplateParams<>>
+  int run(predict_params);
+};
+
 // infer() calls the inference kernel with the parameters on the stream
 template <typename storage_type>
 void infer(storage_type forest, predict_params params, cudaStream_t stream);
diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu
index 2d7b93e148..62a7c7c8c8 100644
--- a/cpp/src/fil/fil.cu
+++ b/cpp/src/fil/fil.cu
@@ -24,7 +24,6 @@
 
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
-#include <rmm/device_uvector.hpp>
 
 #include <treelite/c_api.h>
 #include <treelite/tree.h>
@@ -36,8 +35,10 @@
 #include <omp.h>
 
 #include <algorithm>
+#include <bitset>
 #include <cmath>
 #include <cstddef>
+#include <cstdint>
 #include <iomanip>
 #include <limits>
 #include <stack>
@@ -48,6 +49,20 @@ namespace fil {
 
 namespace tl = treelite;
 
+std::ostream& operator<<(std::ostream& os, const cat_sets_owner& cso)
+{
+  os << "\nbits { ";
+  for (uint8_t b : cso.bits) {
+    os << std::bitset<BITS_PER_BYTE>(b) << " ";
+  }
+  os << " }\nmax_matching {";
+  for (int mm : cso.max_matching) {
+    os << mm << " ";
+  }
+  os << " }";
+  return os;
+}
+
 __host__ __device__ float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); }
 
 /** performs additional transformations on the array of forest predictions
@@ -82,8 +97,13 @@ __global__ void transform_k(float* preds,
     preds[i] = result;
 }
 
+// needed to avoid expanding the dispatch template into unresolved
+// compute_smem_footprint::run() calls. In infer.cu, we don't export those symbols,
+// but rather one symbol for the whole template specialization, as below.
+extern template int dispatch_on_fil_template_params(compute_smem_footprint, predict_params);
+
 struct forest {
-  forest(const raft::handle_t& h) : vector_leaf_(0, h.get_stream()) {}
+  forest(const raft::handle_t& h) : vector_leaf_(0, h.get_stream()), cat_sets_(h.get_stream()) {}
 
   void init_n_items(int device)
   {
@@ -110,14 +130,14 @@ struct forest {
       shmem_size_params& ssp_ = predict_proba ? proba_ssp_ : class_ssp_;
       ssp_.predict_proba      = predict_proba;
       shmem_size_params ssp   = ssp_;
-      // if n_items was not provided, try from 1 to 4. Otherwise, use as-is.
+      // if n_items was not provided, try from 1 to MAX_N_ITEMS. Otherwise, use as-is.
       int min_n_items = ssp.n_items == 0 ? 1 : ssp.n_items;
       int max_n_items =
-        ssp.n_items == 0 ? (algo_ == algo_t::BATCH_TREE_REORG ? 4 : 1) : ssp.n_items;
+        ssp.n_items == 0 ? (algo_ == algo_t::BATCH_TREE_REORG ? MAX_N_ITEMS : 1) : ssp.n_items;
       for (bool cols_in_shmem : {false, true}) {
         ssp.cols_in_shmem = cols_in_shmem;
         for (ssp.n_items = min_n_items; ssp.n_items <= max_n_items; ++ssp.n_items) {
-          ssp.compute_smem_footprint();
+          ssp.shm_sz = dispatch_on_fil_template_params(compute_smem_footprint(), ssp);
           if (ssp.shm_sz < max_shm) ssp_ = ssp;
         }
       }
@@ -132,17 +152,15 @@ struct forest {
     int max_threads_per_sm, sm_count;
     CUDA_CHECK(
       cudaDeviceGetAttribute(&max_threads_per_sm, cudaDevAttrMaxThreadsPerMultiProcessor, device));
-    int max_blocks_per_sm = max_threads_per_sm / FIL_TPB;
-    ASSERT(blocks_per_sm <= max_blocks_per_sm,
-           "on this GPU, FIL blocks_per_sm cannot exceed %d",
-           max_blocks_per_sm);
+    blocks_per_sm = std::min(blocks_per_sm, max_threads_per_sm / FIL_TPB);
     CUDA_CHECK(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device));
     fixed_block_count_ = blocks_per_sm * sm_count;
   }
 
   void init_common(const raft::handle_t& h,
-                   const forest_params_t* params,
-                   const std::vector<float>& vector_leaf)
+                   const categorical_sets& cat_sets,
+                   const std::vector<float>& vector_leaf,
+                   const forest_params_t* params)
   {
     depth_                           = params->depth;
     num_trees_                       = params->num_trees;
@@ -155,23 +173,27 @@ struct forest {
     proba_ssp_.leaf_algo             = params->leaf_algo;
     proba_ssp_.num_cols              = params->num_cols;
     proba_ssp_.num_classes           = params->num_classes;
+    proba_ssp_.cats_present          = cat_sets.cats_present();
     class_ssp_                       = proba_ssp_;
 
-    int device = h.get_device();
+    int device          = h.get_device();
+    cudaStream_t stream = h.get_stream();
     init_n_items(device);  // n_items takes priority over blocks_per_sm
     init_fixed_block_count(device, params->blocks_per_sm);
 
     // vector leaf
     if (!vector_leaf.empty()) {
-      vector_leaf_len_ = vector_leaf.size();
-      vector_leaf_.resize(vector_leaf.size(), h.get_stream());
+      vector_leaf_.resize(vector_leaf.size(), stream);
 
       CUDA_CHECK(cudaMemcpyAsync(vector_leaf_.data(),
                                  vector_leaf.data(),
                                  vector_leaf.size() * sizeof(float),
                                  cudaMemcpyHostToDevice,
-                                 h.get_stream()));
+                                 stream));
     }
+
+    // categorical features
+    cat_sets_ = cat_sets_device_owner(cat_sets, stream);
   }
 
   virtual void infer(predict_params params, cudaStream_t stream) = 0;
@@ -285,7 +307,7 @@ struct forest {
           params.num_outputs = params.num_classes;
           do_transform = (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != 0.0f;
           break;
-        default: ASSERT(false, "internal error: invalid leaf_algo_");
+        default: ASSERT(false, "internal error: predict: invalid leaf_algo %d", params.leaf_algo);
       }
     } else {
       if (params.leaf_algo == leaf_algo_t::FLOAT_UNARY_BINARY) {
@@ -316,7 +338,11 @@ struct forest {
     }
   }
 
-  virtual void free(const raft::handle_t& h) { vector_leaf_.release(); }
+  virtual void free(const raft::handle_t& h)
+  {
+    cat_sets_.release();
+    vector_leaf_.release();
+  }
 
   virtual ~forest() {}
 
@@ -330,7 +356,7 @@ struct forest {
   int fixed_block_count_ = 0;
   // Optionally used
   rmm::device_uvector<float> vector_leaf_;
-  size_t vector_leaf_len_ = 0;
+  cat_sets_device_owner cat_sets_;
 };
 
 struct dense_forest : forest {
@@ -363,11 +389,12 @@ struct dense_forest : forest {
   }
 
   void init(const raft::handle_t& h,
+            const categorical_sets& cat_sets,
+            const std::vector<float>& vector_leaf,
             const dense_node* nodes,
-            const forest_params_t* params,
-            const std::vector<float>& vector_leaf)
+            const forest_params_t* params)
   {
-    init_common(h, params, vector_leaf);
+    init_common(h, cat_sets, vector_leaf, params);
     if (algo_ == algo_t::NAIVE) algo_ = algo_t::BATCH_TREE_REORG;
 
     int num_nodes = forest_num_nodes(num_trees_, depth_);
@@ -391,11 +418,12 @@ struct dense_forest : forest {
 
   virtual void infer(predict_params params, cudaStream_t stream) override
   {
-    dense_storage forest(nodes_.data(),
+    dense_storage forest(cat_sets_.accessor(),
+                         vector_leaf_.data(),
+                         nodes_.data(),
                          num_trees_,
                          algo_ == algo_t::NAIVE ? tree_num_nodes(depth_) : 1,
-                         algo_ == algo_t::NAIVE ? 1 : num_trees_,
-                         vector_leaf_.data());
+                         algo_ == algo_t::NAIVE ? 1 : num_trees_);
     fil::infer(forest, params, stream);
   }
 
@@ -417,12 +445,13 @@ struct sparse_forest : forest {
   }
 
   void init(const raft::handle_t& h,
+            const categorical_sets& cat_sets,
+            const std::vector<float>& vector_leaf,
             const int* trees,
             const node_t* nodes,
-            const forest_params_t* params,
-            const std::vector<float>& vector_leaf)
+            const forest_params_t* params)
   {
-    init_common(h, params, vector_leaf);
+    init_common(h, cat_sets, vector_leaf, params);
     if (algo_ == algo_t::ALGO_AUTO) algo_ = algo_t::NAIVE;
     depth_     = 0;  // a placeholder value
     num_nodes_ = params->num_nodes;
@@ -440,7 +469,8 @@ struct sparse_forest : forest {
 
   virtual void infer(predict_params params, cudaStream_t stream) override
   {
-    sparse_storage<node_t> forest(trees_.data(), nodes_.data(), num_trees_, vector_leaf_.data());
+    sparse_storage<node_t> forest(
+      cat_sets_.accessor(), vector_leaf_.data(), trees_.data(), nodes_.data(), num_trees_);
     fil::infer(forest, params, stream);
   }
 
@@ -578,13 +608,107 @@ int max_depth(const tl::ModelImpl<T, L>& model)
   return depth;
 }
 
-inline void adjust_threshold(
+void elementwise_combine(std::vector<cat_feature_counters>& dst,
+                         const std::vector<cat_feature_counters>& extra)
+{
+  std::transform(dst.begin(), dst.end(), extra.begin(), dst.begin(), cat_feature_counters::combine);
+}
+
+// constructs a vector of size n_cols (number of features, or columns) from a Treelite tree,
+// where each feature has a maximum matching category and node count (from this tree alone).
+template <typename T, typename L>
+inline std::vector<cat_feature_counters> cat_counter_vec(const tl::Tree<T, L>& tree, int n_cols)
+{
+  std::vector<cat_feature_counters> res(n_cols);
+  std::stack<int> stack;
+  stack.push(tree_root(tree));
+  while (!stack.empty()) {
+    int node_id = stack.top();
+    stack.pop();
+    while (!tree.IsLeaf(node_id)) {
+      if (tree.SplitType(node_id) == tl::SplitFeatureType::kCategorical) {
+        std::vector<std::uint32_t> mmv = tree.MatchingCategories(node_id);
+        int max_matching_cat;
+        if (mmv.size() > 0) {
+          // in `struct cat_feature_counters` and GPU structures, max matching category is an int
+          // cast is safe because all precise int floats fit into ints, which are asserted to be 32
+          // bits
+          max_matching_cat = mmv.back();
+          ASSERT(max_matching_cat <= MAX_PRECISE_INT_FLOAT,
+                 "FIL cannot infer on "
+                 "more than %d matching categories",
+                 MAX_PRECISE_INT_FLOAT);
+        } else {
+          max_matching_cat = -1;
+        }
+        cat_feature_counters& counters = res[tree.SplitIndex(node_id)];
+        counters =
+          cat_feature_counters::combine(counters, cat_feature_counters{max_matching_cat, 1});
+      }
+      stack.push(tree.LeftChild(node_id));
+      node_id = tree.RightChild(node_id);
+    }
+  }
+  return res;
+}
+
+// computes overall categorical bit pool size for a tree imported from the Treelite tree
+template <typename T, typename L>
+inline std::size_t bit_pool_size(const tl::Tree<T, L>& tree, const categorical_sets& cat_sets)
+{
+  std::size_t size = 0;
+  std::stack<int> stack;
+  stack.push(tree_root(tree));
+  while (!stack.empty()) {
+    int node_id = stack.top();
+    stack.pop();
+    while (!tree.IsLeaf(node_id)) {
+      if (tree.SplitType(node_id) == tl::SplitFeatureType::kCategorical) {
+        int fid = tree.SplitIndex(node_id);
+        size += cat_sets.sizeof_mask(fid);
+      }
+      stack.push(tree.LeftChild(node_id));
+      node_id = tree.RightChild(node_id);
+    }
+  }
+  return size;
+}
+
+template <typename T, typename L>
+cat_sets_owner allocate_cat_sets_owner(const tl::ModelImpl<T, L>& model)
+{
+#pragma omp declare reduction(cat_counter_vec_red : std::vector<cat_feature_counters> \
+      : elementwise_combine(omp_out, omp_in))                 \
+    initializer(omp_priv = omp_orig)
+  const auto& trees = model.trees;
+  cat_sets_owner cat_sets;
+  std::vector<cat_feature_counters> counters(model.num_feature);
+#pragma omp parallel for reduction(cat_counter_vec_red : counters)
+  for (std::size_t i = 0; i < trees.size(); ++i) {
+    elementwise_combine(counters, cat_counter_vec(trees[i], model.num_feature));
+  }
+  cat_sets.consume_counters(counters);
+  std::vector<std::size_t> bit_pool_sizes(trees.size());
+#pragma omp parallel for
+  for (std::size_t i = 0; i < trees.size(); ++i) {
+    bit_pool_sizes[i] = bit_pool_size(trees[i], cat_sets.accessor());
+  }
+  cat_sets.consume_bit_pool_sizes(bit_pool_sizes);
+  return cat_sets;
+}
+
+void adjust_threshold(
   float* pthreshold, int* tl_left, int* tl_right, bool* default_left, tl::Operator comparison_op)
 {
   // in treelite (take left node if val [op] threshold),
   // the meaning of the condition is reversed compared to FIL;
   // thus, "<" in treelite corresonds to comparison ">=" used by FIL
   // https://github.com/dmlc/treelite/blob/master/include/treelite/tree.h#L243
+  if (isnan(*pthreshold)) {
+    std::swap(*tl_left, *tl_right);
+    *default_left = !*default_left;
+    return;
+  }
   switch (comparison_op) {
     case tl::Operator::kLT: break;
     case tl::Operator::kLE:
@@ -662,6 +786,56 @@ void tl2fil_leaf_payload(fil_node_t* fil_node,
   };
 }
 
+template <typename fil_node_t>
+struct conversion_state {
+  fil_node_t node;
+  int tl_left;
+  int tl_right;
+};
+
+// modifies cat_sets
+template <typename fil_node_t, typename T, typename L>
+conversion_state<fil_node_t> tl2fil_inner_node(int fil_left_child,
+                                               const tl::Tree<T, L>& tree,
+                                               int tl_node_id,
+                                               const forest_params_t& forest_params,
+                                               cat_sets_owner* cat_sets,
+                                               std::size_t* bit_pool_offset)
+{
+  int tl_left = tree.LeftChild(tl_node_id), tl_right = tree.RightChild(tl_node_id);
+  val_t split         = {.f = NAN};  // yes there's a default initializer already
+  int feature_id      = tree.SplitIndex(tl_node_id);
+  bool is_categorical = tree.SplitType(tl_node_id) == tl::SplitFeatureType::kCategorical;
+  bool default_left   = tree.DefaultLeft(tl_node_id);
+  if (tree.SplitType(tl_node_id) == tl::SplitFeatureType::kNumerical) {
+    split.f = static_cast<float>(tree.Threshold(tl_node_id));
+    adjust_threshold(&split.f, &tl_left, &tl_right, &default_left, tree.ComparisonOp(tl_node_id));
+  } else if (tree.SplitType(tl_node_id) == tl::SplitFeatureType::kCategorical) {
+    // for FIL, the list of categories is always for the right child
+    if (!tree.CategoriesListRightChild(tl_node_id)) {
+      std::swap(tl_left, tl_right);
+      default_left = !default_left;
+    }
+    int sizeof_mask = cat_sets->accessor().sizeof_mask(feature_id);
+    split.idx       = *bit_pool_offset;
+    *bit_pool_offset += sizeof_mask;
+    // cat_sets->bits have been zero-initialized
+    uint8_t* bits = &cat_sets->bits[split.idx];
+    for (std::uint32_t category : tree.MatchingCategories(tl_node_id)) {
+      bits[category / BITS_PER_BYTE] |= 1 << (category % BITS_PER_BYTE);
+    }
+  } else {
+    ASSERT(false, "only numerical and categorical split nodes are supported");
+  }
+  fil_node_t node;
+  if constexpr (std::is_same<fil_node_t, dense_node>()) {
+    node = fil_node_t({}, split, feature_id, default_left, false, is_categorical);
+  } else {
+    node = fil_node_t({}, split, feature_id, default_left, false, is_categorical, fil_left_child);
+  }
+  return conversion_state<fil_node_t>{node, tl_left, tl_right};
+}
+
 template <typename T, typename L>
 void node2fil_dense(std::vector<dense_node>* pnodes,
                     int root,
@@ -670,47 +844,75 @@ void node2fil_dense(std::vector<dense_node>* pnodes,
                     int node_id,
                     const forest_params_t& forest_params,
                     std::vector<float>* vector_leaf,
-                    size_t* leaf_counter)
+                    std::size_t* leaf_counter,
+                    cat_sets_owner* cat_sets,
+                    std::size_t* bit_pool_offset)
 {
   if (tree.IsLeaf(node_id)) {
-    (*pnodes)[root + cur] = dense_node(val_t{.f = NAN}, NAN, 0, false, true);
+    (*pnodes)[root + cur] = dense_node({}, {}, 0, false, true, false);
     tl2fil_leaf_payload(
       &(*pnodes)[root + cur], root + cur, tree, node_id, forest_params, vector_leaf, leaf_counter);
     return;
   }
 
   // inner node
-  ASSERT(tree.SplitType(node_id) == tl::SplitFeatureType::kNumerical,
-         "only numerical split nodes are supported");
-  int tl_left = tree.LeftChild(node_id), tl_right = tree.RightChild(node_id);
-  bool default_left = tree.DefaultLeft(node_id);
-  float threshold   = static_cast<float>(tree.Threshold(node_id));
-  adjust_threshold(&threshold, &tl_left, &tl_right, &default_left, tree.ComparisonOp(node_id));
-  (*pnodes)[root + cur] =
-    dense_node(val_t{.f = 0}, threshold, tree.SplitIndex(node_id), default_left, false);
   int left = 2 * cur + 1;
-  node2fil_dense(pnodes, root, left, tree, tl_left, forest_params, vector_leaf, leaf_counter);
-  node2fil_dense(pnodes, root, left + 1, tree, tl_right, forest_params, vector_leaf, leaf_counter);
+  conversion_state<dense_node> cs =
+    tl2fil_inner_node<dense_node>(left, tree, node_id, forest_params, cat_sets, bit_pool_offset);
+  (*pnodes)[root + cur] = cs.node;
+  node2fil_dense(pnodes,
+                 root,
+                 left,
+                 tree,
+                 cs.tl_left,
+                 forest_params,
+                 vector_leaf,
+                 leaf_counter,
+                 cat_sets,
+                 bit_pool_offset);
+  node2fil_dense(pnodes,
+                 root,
+                 left + 1,
+                 tree,
+                 cs.tl_right,
+                 forest_params,
+                 vector_leaf,
+                 leaf_counter,
+                 cat_sets,
+                 bit_pool_offset);
 }
 
 template <typename T, typename L>
 void tree2fil_dense(std::vector<dense_node>* pnodes,
                     int root,
                     const tl::Tree<T, L>& tree,
+                    std::size_t tree_idx,
                     const forest_params_t& forest_params,
                     std::vector<float>* vector_leaf,
-                    size_t* leaf_counter)
+                    std::size_t* leaf_counter,
+                    cat_sets_owner* cat_sets)
 {
-  node2fil_dense(pnodes, root, 0, tree, tree_root(tree), forest_params, vector_leaf, leaf_counter);
+  node2fil_dense(pnodes,
+                 root,
+                 0,
+                 tree,
+                 tree_root(tree),
+                 forest_params,
+                 vector_leaf,
+                 leaf_counter,
+                 cat_sets,
+                 &cat_sets->bit_pool_offsets[tree_idx]);
 }
 
 template <typename fil_node_t, typename T, typename L>
 int tree2fil_sparse(std::vector<fil_node_t>& nodes,
                     int root,
                     const tl::Tree<T, L>& tree,
+                    std::size_t tree_idx,
                     const forest_params_t& forest_params,
                     std::vector<float>* vector_leaf,
-                    size_t* leaf_counter)
+                    std::size_t* leaf_counter,
+                    cat_sets_owner* cat_sets)
 {
   typedef std::pair<int, int> pair_t;
   std::stack<pair_t> stack;
@@ -723,33 +925,23 @@ int tree2fil_sparse(std::vector<fil_node_t>& nodes,
     stack.pop();
 
     while (!tree.IsLeaf(node_id)) {
-      // inner node
-      ASSERT(tree.SplitType(node_id) == tl::SplitFeatureType::kNumerical,
-             "only numerical split nodes are supported");
-      // tl_left and tl_right are indices of the children in the treelite tree
-      // (stored  as an array of nodes)
-      int tl_left = tree.LeftChild(node_id), tl_right = tree.RightChild(node_id);
-      bool default_left = tree.DefaultLeft(node_id);
-      float threshold   = static_cast<float>(tree.Threshold(node_id));
-      adjust_threshold(&threshold, &tl_left, &tl_right, &default_left, tree.ComparisonOp(node_id));
-
       // reserve space for child nodes
       // left is the offset of the left child node relative to the tree root
       // in the array of all nodes of the FIL sparse forest
       int left = built_index - root;
       built_index += 2;
-      nodes[root + cur] =
-        fil_node_t(val_t{.f = 0}, threshold, tree.SplitIndex(node_id), default_left, false, left);
-
+      conversion_state<fil_node_t> cs = tl2fil_inner_node<fil_node_t>(
+        left, tree, node_id, forest_params, cat_sets, &cat_sets->bit_pool_offsets[tree_idx]);
+      nodes[root + cur] = cs.node;
       // push child nodes into the stack
-      stack.push(pair_t(tl_right, left + 1));
+      stack.push(pair_t(cs.tl_right, left + 1));
       // stack.push(pair_t(tl_left, left));
-      node_id = tl_left;
+      node_id = cs.tl_left;
       cur     = left;
     }
 
     // leaf node
-    nodes[root + cur] = fil_node_t(val_t{.f = NAN}, NAN, 0, false, true, 0);
+    nodes[root + cur] = fil_node_t({}, {}, 0, false, true, false, 0);
     tl2fil_leaf_payload(
       &nodes[root + cur], root + cur, tree, node_id, forest_params, vector_leaf, leaf_counter);
   }
@@ -922,6 +1114,7 @@ void tl2fil_dense(std::vector<dense_node>* pnodes,
                   forest_params_t* params,
                   const tl::ModelImpl<threshold_t, leaf_t>& model,
                   const treelite_params_t* tl_params,
+                  cat_sets_owner* cat_sets,
                   std::vector<float>* vector_leaf)
 {
   tl2fil_common(params, model, tl_params);
@@ -932,15 +1125,18 @@ void tl2fil_dense(std::vector<dense_node>* pnodes,
   if (params->leaf_algo == VECTOR_LEAF) {
     vector_leaf->resize(max_leaves_per_tree * params->num_trees * params->num_classes);
   }
+  *cat_sets = allocate_cat_sets_owner(model);
   pnodes->resize(num_nodes, dense_node());
   for (std::size_t i = 0; i < model.trees.size(); ++i) {
     size_t leaf_counter = max_leaves_per_tree * i;
     tree2fil_dense(pnodes,
                    i * tree_num_nodes(params->depth),
                    model.trees[i],
+                   i,
                    *params,
                    vector_leaf,
-                   &leaf_counter);
+                   &leaf_counter,
+                   cat_sets);
   }
 }
 
@@ -984,7 +1180,7 @@ struct tl2fil_sparse_check_t<sparse_node8> {
     for (std::size_t i = 0; i < trees.size(); ++i) {
       int num_nodes = trees[i].num_nodes;
       ASSERT(num_nodes <= MAX_TREE_NODES,
-             "tree %lu has %d nodes, "
+             "tree %zu has %d nodes, "
              "but only %d supported for 8-byte sparse nodes",
              i,
              num_nodes,
@@ -1001,6 +1197,7 @@ void tl2fil_sparse(std::vector<int>* ptrees,
                    forest_params_t* params,
                    const tl::ModelImpl<threshold_t, leaf_t>& model,
                    const treelite_params_t* tl_params,
+                   cat_sets_owner* cat_sets,
                    std::vector<float>* vector_leaf)
 {
   tl2fil_common(params, model, tl_params);
@@ -1020,14 +1217,16 @@ void tl2fil_sparse(std::vector<int>* ptrees,
     vector_leaf->resize(max_leaves * params->num_classes);
   }
 
+  *cat_sets = allocate_cat_sets_owner(model);
   pnodes->resize(total_nodes);
 
-  // convert the nodes
+// convert the nodes
 #pragma omp parallel for
   for (std::size_t i = 0; i < num_trees; ++i) {
     // Max number of leaves processed so far
     size_t leaf_counter = ((*ptrees)[i] + i) / 2;
-    tree2fil_sparse(*pnodes, (*ptrees)[i], model.trees[i], *params, vector_leaf, &leaf_counter);
+    tree2fil_sparse(
+      *pnodes, (*ptrees)[i], model.trees[i], i, *params, vector_leaf, &leaf_counter, cat_sets);
   }
 
   params->num_nodes = pnodes->size();
@@ -1035,44 +1234,48 @@ void tl2fil_sparse(std::vector<int>* ptrees,
 
 void init_dense(const raft::handle_t& h,
                 forest_t* pf,
+                const categorical_sets& cat_sets,
+                const std::vector<float>& vector_leaf,
                 const dense_node* nodes,
-                const forest_params_t* params,
-                const std::vector<float>& vector_leaf)
+                const forest_params_t* params)
 {
   check_params(params, true);
   dense_forest* f = new dense_forest(h);
-  f->init(h, nodes, params, vector_leaf);
+  f->init(h, cat_sets, vector_leaf, nodes, params);
   *pf = f;
 }
 
 template <typename fil_node_t>
 void init_sparse(const raft::handle_t& h,
                  forest_t* pf,
+                 const categorical_sets& cat_sets,
+                 const std::vector<float>& vector_leaf,
                  const int* trees,
                  const fil_node_t* nodes,
-                 const forest_params_t* params,
-                 const std::vector<float>& vector_leaf)
+                 const forest_params_t* params)
 {
   check_params(params, false);
   sparse_forest<fil_node_t>* f = new sparse_forest<fil_node_t>(h);
-  f->init(h, trees, nodes, params, vector_leaf);
+  f->init(h, cat_sets, vector_leaf, trees, nodes, params);
   *pf = f;
 }
 
 // explicit instantiations for init_sparse()
 template void init_sparse<sparse_node16>(const raft::handle_t& h,
                                          forest_t* pf,
+                                         const categorical_sets& cat_sets,
+                                         const std::vector<float>& vector_leaf,
                                          const int* trees,
                                          const sparse_node16* nodes,
-                                         const forest_params_t* params,
-                                         const std::vector<float>& vector_leaf);
+                                         const forest_params_t* params);
 
 template void init_sparse<sparse_node8>(const raft::handle_t& h,
                                         forest_t* pf,
+                                        const categorical_sets& cat_sets,
+                                        const std::vector<float>& vector_leaf,
                                         const int* trees,
                                         const sparse_node8* nodes,
-                                        const forest_params_t* params,
-                                        const std::vector<float>& vector_leaf);
+                                        const forest_params_t* params);
 
 template <typename threshold_t, typename leaf_t>
 void from_treelite(const raft::handle_t& handle,
@@ -1099,7 +1302,8 @@ void from_treelite(const raft::handle_t& handle,
   if (storage_type == storage_type_t::AUTO) {
     if (tl_params->algo == algo_t::ALGO_AUTO || tl_params->algo == algo_t::NAIVE) {
       int depth = max_depth(model);
-      // max 2**25 dense nodes, 256 MiB dense model size
+      // max 2**25 dense nodes, 256 MiB dense model size. Categorical mask size is unlimited and not
+      // affected by storage format.
       const int LOG2_MAX_DENSE_NODES = 25;
       int log2_num_dense_nodes       = depth + 1 + int(ceil(std::log2(model.trees.size())));
       storage_type = log2_num_dense_nodes > LOG2_MAX_DENSE_NODES ? storage_type_t::SPARSE
@@ -1111,17 +1315,18 @@ void from_treelite(const raft::handle_t& handle,
   }
 
   forest_params_t params;
+  cat_sets_owner cat_sets;
   switch (storage_type) {
     case storage_type_t::DENSE: {
       std::vector<dense_node> nodes;
       std::vector<float> vector_leaf;
-      tl2fil_dense(&nodes, &params, model, tl_params, &vector_leaf);
-      init_dense(handle, pforest, nodes.data(), &params, vector_leaf);
+      tl2fil_dense(&nodes, &params, model, tl_params, &cat_sets, &vector_leaf);
+      init_dense(handle, pforest, cat_sets.accessor(), vector_leaf, nodes.data(), &params);
       // sync is necessary as nodes is used in init_dense(),
       // but destructed at the end of this function
       CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
       if (tl_params->pforest_shape_str) {
-        *tl_params->pforest_shape_str = sprintf_shape(model, storage_type, nodes, {});
+        *tl_params->pforest_shape_str = sprintf_shape(model, storage_type, nodes, {}, cat_sets);
       }
       break;
     }
@@ -1129,11 +1334,12 @@ void from_treelite(const raft::handle_t& handle,
       std::vector<int> trees;
       std::vector<sparse_node16> nodes;
       std::vector<float> vector_leaf;
-      tl2fil_sparse(&trees, &nodes, &params, model, tl_params, &vector_leaf);
-      init_sparse(handle, pforest, trees.data(), nodes.data(), &params, vector_leaf);
+      tl2fil_sparse(&trees, &nodes, &params, model, tl_params, &cat_sets, &vector_leaf);
+      init_sparse(
+        handle, pforest, cat_sets.accessor(), vector_leaf, trees.data(), nodes.data(), &params);
       CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
       if (tl_params->pforest_shape_str) {
-        *tl_params->pforest_shape_str = sprintf_shape(model, storage_type, nodes, trees);
+        *tl_params->pforest_shape_str = sprintf_shape(model, storage_type, nodes, trees, cat_sets);
       }
       break;
     }
@@ -1141,11 +1347,12 @@ void from_treelite(const raft::handle_t& handle,
       std::vector<int> trees;
       std::vector<sparse_node8> nodes;
       std::vector<float> vector_leaf;
-      tl2fil_sparse(&trees, &nodes, &params, model, tl_params, &vector_leaf);
-      init_sparse(handle, pforest, trees.data(), nodes.data(), &params, vector_leaf);
+      tl2fil_sparse(&trees, &nodes, &params, model, tl_params, &cat_sets, &vector_leaf);
+      init_sparse(
+        handle, pforest, cat_sets.accessor(), vector_leaf, trees.data(), nodes.data(), &params);
       CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
       if (tl_params->pforest_shape_str) {
-        *tl_params->pforest_shape_str = sprintf_shape(model, storage_type, nodes, trees);
+        *tl_params->pforest_shape_str = sprintf_shape(model, storage_type, nodes, trees, cat_sets);
       }
       break;
     }
@@ -1170,13 +1377,28 @@ template <typename threshold_t, typename leaf_t, typename node_t>
 char* sprintf_shape(const tl::ModelImpl<threshold_t, leaf_t>& model,
                     storage_type_t storage,
                     const std::vector<node_t>& nodes,
-                    const std::vector<int>& trees)
+                    const std::vector<int>& trees,
+                    const cat_sets_owner cat_sets)
 {
   std::stringstream forest_shape = depth_hist_and_max(model);
-  float size_mb =
-    (trees.size() * sizeof(trees.front()) + nodes.size() * sizeof(nodes.front())) / 1e6;
+  double size_mb = (trees.size() * sizeof(trees.front()) + nodes.size() * sizeof(nodes.front()) +
+                    cat_sets.bits.size()) /
+                   1e6;
   forest_shape << storage_type_repr[storage] << " model size " << std::setprecision(2) << size_mb
                << " MB" << std::endl;
+  if (cat_sets.bits.size() > 0) {
+    forest_shape << "number of categorical nodes for each feature id: {";
+    std::size_t total_cat_nodes = 0;
+    for (std::size_t n : cat_sets.n_nodes) {
+      forest_shape << n << " ";
+      total_cat_nodes += n;
+    }
+    forest_shape << "}" << std::endl << "total categorical nodes: " << total_cat_nodes << std::endl;
+    forest_shape << "maximum matching category for each feature id: {";
+    for (int mm : cat_sets.max_matching)
+      forest_shape << mm << " ";
+    forest_shape << "}" << std::endl;
+  }
   // stream may be discontiguous
   std::string forest_shape_str = forest_shape.str();
   // now copy to a non-owning allocation
diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu
index 7b99dfb1db..0f709db5ea 100644
--- a/cpp/src/fil/infer.cu
+++ b/cpp/src/fil/infer.cu
@@ -21,6 +21,7 @@
 #include <cuml/fil/multi_sum.cuh>
 
 #include <raft/cudart_utils.h>
+#include <raft/cuda_utils.cuh>
 
 #include <thrust/functional.h>
 
@@ -152,7 +153,7 @@ __device__ __forceinline__ vec<NITEMS, output_type> tree_leaf_output(tree_type t
   return out;
 }
 
-template <int NITEMS, typename output_type, typename tree_type>
+template <int NITEMS, bool CATS_SUPPORTED, typename output_type, typename tree_type>
 __device__ __forceinline__ vec<NITEMS, output_type> infer_one_tree(tree_type tree,
                                                                    const float* input,
                                                                    int cols,
@@ -170,9 +171,7 @@ __device__ __forceinline__ vec<NITEMS, output_type> infer_one_tree(tree_type tre
       auto n = tree[curr[j]];
       mask &= ~(n.is_leaf() << j);
       if ((mask & (1 << j)) != 0) {
-        float val = input[j * cols + n.fid()];
-        bool cond = isnan(val) ? !n.def_left() : val >= n.thresh();
-        curr[j]   = n.left(curr[j]) + cond;
+        curr[j] = tree.child_index<CATS_SUPPORTED>(n, curr[j], input[j * cols + n.fid()]);
       }
     }
   } while (mask != 0);
@@ -195,8 +194,7 @@ __device__ __forceinline__ vec<1, output_type> infer_one_tree(tree_type tree,
   for (;;) {
     auto n = tree[curr];
     if (n.is_leaf()) break;
-    float val = input[n.fid()];
-    bool cond = isnan(val) ? !n.def_left() : val >= n.thresh();
+    bool cond = tree.child_index<true>(n, curr, input[n.fid()]);
     curr      = n.left(curr) + cond;
   }
   vec<1, output_type> out;
@@ -785,7 +783,11 @@ __device__ INLINE_CONFIG void load_data(float* sdata,
     sdata[idx] = 0.0f;
 }
 
-template <int NITEMS, leaf_algo_t leaf_algo, bool cols_in_shmem, class storage_type>
+template <int NITEMS,
+          leaf_algo_t leaf_algo,
+          bool cols_in_shmem,
+          bool CATS_SUPPORTED,
+          class storage_type>
 __global__ void infer_k(storage_type forest, predict_params params)
 {
   extern __shared__ char smem[];
@@ -821,7 +823,7 @@ __global__ void infer_k(storage_type forest, predict_params params)
       typedef typename leaf_output_t<leaf_algo>::T pred_t;
       vec<NITEMS, pred_t> prediction;
       if (tree < forest.num_trees() && thread_num_rows != 0) {
-        prediction = infer_one_tree<NITEMS, pred_t>(
+        prediction = infer_one_tree<NITEMS, CATS_SUPPORTED, pred_t>(
           forest[tree],
           cols_in_shmem ? sdata + thread_row0 * sdata_stride : block_input + thread_row0 * num_cols,
           cols_in_shmem ? sdata_stride : num_cols,
@@ -849,101 +851,46 @@ size_t shmem_size_params::get_smem_footprint()
   size_t accumulate_footprint =
     tree_aggregator_t<NITEMS, leaf_algo>::smem_accumulate_footprint(num_classes) +
     cols_shmem_size();
-
   return std::max(accumulate_footprint, finalize_footprint);
 }
 
-template <int NITEMS>
-size_t shmem_size_params::get_smem_footprint()
+template <class KernelParams>
+int compute_smem_footprint::run(predict_params ssp)
 {
-  switch (leaf_algo) {
-    case FLOAT_UNARY_BINARY: return get_smem_footprint<NITEMS, FLOAT_UNARY_BINARY>();
-    case CATEGORICAL_LEAF: return get_smem_footprint<NITEMS, CATEGORICAL_LEAF>();
-    case GROVE_PER_CLASS:
-      if (num_classes > FIL_TPB) return get_smem_footprint<NITEMS, GROVE_PER_CLASS_MANY_CLASSES>();
-      return get_smem_footprint<NITEMS, GROVE_PER_CLASS_FEW_CLASSES>();
-    case VECTOR_LEAF: return get_smem_footprint<NITEMS, VECTOR_LEAF>();
-    default: ASSERT(false, "internal error: unexpected leaf_algo_t");
-  }
+  return ssp.template get_smem_footprint<KernelParams::N_ITEMS, KernelParams::LEAF_ALGO>();
 }
 
-void shmem_size_params::compute_smem_footprint()
-{
-  switch (n_items) {
-    case 1: shm_sz = get_smem_footprint<1>(); break;
-    case 2: shm_sz = get_smem_footprint<2>(); break;
-    case 3: shm_sz = get_smem_footprint<3>(); break;
-    case 4: shm_sz = get_smem_footprint<4>(); break;
-    default: ASSERT(false, "internal error: n_items > 4");
-  }
-}
+// make sure to instantiate all possible get_smem_footprint instantiations
+template int dispatch_on_fil_template_params(compute_smem_footprint, predict_params);
 
-template <leaf_algo_t leaf_algo, bool cols_in_shmem, typename storage_type>
-void infer_k_nitems_launcher(storage_type forest,
-                             predict_params params,
-                             cudaStream_t stream,
-                             int block_dim_x)
-{
-  switch (params.n_items) {
-    case 1:
-      infer_k<1, leaf_algo, cols_in_shmem>
-        <<<params.num_blocks, block_dim_x, params.shm_sz, stream>>>(forest, params);
-      break;
-    case 2:
-      infer_k<2, leaf_algo, cols_in_shmem>
-        <<<params.num_blocks, block_dim_x, params.shm_sz, stream>>>(forest, params);
-      break;
-    case 3:
-      infer_k<3, leaf_algo, cols_in_shmem>
-        <<<params.num_blocks, block_dim_x, params.shm_sz, stream>>>(forest, params);
-      break;
-    case 4:
-      infer_k<4, leaf_algo, cols_in_shmem>
-        <<<params.num_blocks, block_dim_x, params.shm_sz, stream>>>(forest, params);
-      break;
-    default: ASSERT(false, "internal error: nitems > 4");
-  }
-  CUDA_CHECK(cudaPeekAtLastError());
-}
+template <typename storage_type>
+struct infer_k_storage_template : dispatch_functor<void> {
+  storage_type forest;
+  cudaStream_t stream;
+  infer_k_storage_template(storage_type forest_, cudaStream_t stream_)
+    : forest(forest_), stream(stream_)
+  {
+  }
 
-template <leaf_algo_t leaf_algo, typename storage_type>
-void infer_k_launcher(storage_type forest,
-                      predict_params params,
-                      cudaStream_t stream,
-                      int blockdim_x)
-{
-  params.num_blocks = params.num_blocks != 0 ? params.num_blocks
-                                             : raft::ceildiv(int(params.num_rows), params.n_items);
-  if (params.cols_in_shmem) {
-    infer_k_nitems_launcher<leaf_algo, true>(forest, params, stream, blockdim_x);
-  } else {
-    infer_k_nitems_launcher<leaf_algo, false>(forest, params, stream, blockdim_x);
+  template <class KernelParams = KernelTemplateParams<>>
+  void run(predict_params params)
+  {
+    params.num_blocks = params.num_blocks != 0
+                          ? params.num_blocks
+                          : raft::ceildiv(int(params.num_rows), params.n_items);
+    infer_k<KernelParams::N_ITEMS,
+            KernelParams::LEAF_ALGO,
+            KernelParams::COLS_IN_SHMEM,
+            KernelParams::CATS_SUPPORTED>
+      <<<params.num_blocks, params.block_dim_x, params.shm_sz, stream>>>(forest, params);
+    CUDA_CHECK(cudaPeekAtLastError());
   }
-}
+};
 
 template <typename storage_type>
 void infer(storage_type forest, predict_params params, cudaStream_t stream)
 {
-  switch (params.leaf_algo) {
-    case FLOAT_UNARY_BINARY:
-      infer_k_launcher<FLOAT_UNARY_BINARY>(forest, params, stream, FIL_TPB);
-      break;
-    case GROVE_PER_CLASS:
-      if (params.num_classes > FIL_TPB) {
-        params.leaf_algo = GROVE_PER_CLASS_MANY_CLASSES;
-        infer_k_launcher<GROVE_PER_CLASS_MANY_CLASSES>(forest, params, stream, FIL_TPB);
-      } else {
-        params.leaf_algo = GROVE_PER_CLASS_FEW_CLASSES;
-        infer_k_launcher<GROVE_PER_CLASS_FEW_CLASSES>(
-          forest, params, stream, FIL_TPB - FIL_TPB % params.num_classes);
-      }
-      break;
-    case CATEGORICAL_LEAF:
-      infer_k_launcher<CATEGORICAL_LEAF>(forest, params, stream, FIL_TPB);
-      break;
-    case VECTOR_LEAF: infer_k_launcher<VECTOR_LEAF>(forest, params, stream, FIL_TPB); break;
-    default: ASSERT(false, "internal error: invalid leaf_algo");
-  }
+  dispatch_on_fil_template_params(infer_k_storage_template<storage_type>(forest, stream), params);
 }
 
 template void infer<dense_storage>(dense_storage forest,
diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh
index ed5b93dd41..1d182fc8f5 100644
--- a/cpp/src/fil/internal.cuh
+++ b/cpp/src/fil/internal.cuh
@@ -18,6 +18,15 @@
 
 #pragma once
 #include <cuml/fil/fil.h>
+#include <treelite/c_api.h>
+#include <treelite/tree.h>
+#include <bitset>
+#include <cstdint>
+#include <iostream>
+#include <numeric>
+#include <raft/cuda_utils.cuh>
+#include <raft/error.hpp>
+#include <rmm/device_uvector.hpp>
 #include <vector>
 
 namespace raft {
@@ -27,6 +36,8 @@ class handle_t;
 namespace ML {
 namespace fil {
 
+const int BITS_PER_BYTE = 8;
+
 /// modpow2 returns a % b == a % pow(2, log2_b)
 __host__ __device__ __forceinline__ int modpow2(int a, int log2_b)
 {
@@ -73,51 +84,73 @@ enum output_t {
 
 /** val_t is the payload within a FIL leaf */
 union val_t {
-  /** threshold value for branch node or output value (e.g. class
+  /** threshold value for parent node or output value (e.g. class
       probability or regression summand) for leaf node */
-  float f;
-  /** class label */
+  float f = NAN;
+  /** class label, leaf vector index or categorical node set offset */
   int idx;
 };
 
 /** base_node contains common implementation details for dense and sparse nodes */
 struct base_node {
-  /** val is either the threshold (for inner nodes, always float)
-      or the tree prediction (for leaf nodes) */
+  /** val, for parent nodes, is a threshold or category list offset. For leaf
+      nodes, it is the tree prediction (see see leaf_output_t<leaf_algo_t>::T) */
   val_t val;
   /** bits encode various information about the node, with the exact nature of
       this information depending on the node type; it includes e.g. whether the
       node is a leaf or inner node, and for inner nodes, additional information,
       e.g. the default direction, feature id or child index */
   int bits;
-  static const int FID_MASK      = (1 << 30) - 1;
-  static const int DEF_LEFT_MASK = 1 << 30;
-  static const int IS_LEAF_MASK  = 1 << 31;
+  static const int IS_LEAF_OFFSET        = 31;
+  static const int IS_LEAF_MASK          = 1 << IS_LEAF_OFFSET;
+  static const int DEF_LEFT_OFFSET       = IS_LEAF_OFFSET - 1;
+  static const int DEF_LEFT_MASK         = 1 << DEF_LEFT_OFFSET;
+  static const int IS_CATEGORICAL_OFFSET = DEF_LEFT_OFFSET - 1;
+  static const int IS_CATEGORICAL_MASK   = 1 << IS_CATEGORICAL_OFFSET;
+  static const int FID_MASK              = (1 << IS_CATEGORICAL_OFFSET) - 1;
   template <class o_t>
-  __host__ __device__ o_t output() const
-  {
-    return val;
-  }
+  __host__ __device__ o_t output() const;
+  __host__ __device__ int set() const { return val.idx; }
   __host__ __device__ float thresh() const { return val.f; }
+  __host__ __device__ val_t split() const { return val; }
   __host__ __device__ int fid() const { return bits & FID_MASK; }
   __host__ __device__ bool def_left() const { return bits & DEF_LEFT_MASK; }
   __host__ __device__ bool is_leaf() const { return bits & IS_LEAF_MASK; }
-  __host__ __device__ base_node() : val({.f = 0}), bits(0){};
-  base_node(val_t output, float thresh, int fid, bool def_left, bool is_leaf)
+  __host__ __device__ bool is_categorical() const { return bits & IS_CATEGORICAL_MASK; }
+  __host__ __device__ base_node() : val{}, bits(0) {}
+  base_node(val_t output, val_t split, int fid, bool def_left, bool is_leaf, bool is_categorical)
   {
-    bits = (fid & FID_MASK) | (def_left ? DEF_LEFT_MASK : 0) | (is_leaf ? IS_LEAF_MASK : 0);
+    RAFT_EXPECTS((fid & FID_MASK) == fid, "internal error: feature ID doesn't fit into base_node");
+    bits = (fid & FID_MASK) | (def_left ? DEF_LEFT_MASK : 0) | (is_leaf ? IS_LEAF_MASK : 0) |
+           (is_categorical ? IS_CATEGORICAL_MASK : 0);
     if (is_leaf)
       val = output;
     else
-      val.f = thresh;
+      val = split;
   }
 };
 
+template <>
+__host__ __device__ __forceinline__ float base_node::output<float>() const
+{
+  return val.f;
+}
+template <>
+__host__ __device__ __forceinline__ int base_node::output<int>() const
+{
+  return val.idx;
+}
+template <>
+__host__ __device__ __forceinline__ val_t base_node::output<val_t>() const
+{
+  return val;
+}
+
 /** dense_node is a single node of a dense forest */
 struct alignas(8) dense_node : base_node {
   dense_node() = default;
-  dense_node(val_t output, float thresh, int fid, bool def_left, bool is_leaf)
-    : base_node(output, thresh, fid, def_left, is_leaf)
+  dense_node(val_t output, val_t split, int fid, bool def_left, bool is_leaf, bool is_categorical)
+    : base_node(output, split, fid, def_left, is_leaf, is_categorical)
   {
   }
   /** index of the left child, where curr is the index of the current node */
@@ -129,8 +162,16 @@ struct alignas(16) sparse_node16 : base_node {
   int left_idx;
   int dummy;  // make alignment explicit and reserve for future use
   __host__ __device__ sparse_node16() : left_idx(0), dummy(0) {}
-  sparse_node16(val_t output, float thresh, int fid, bool def_left, bool is_leaf, int left_index)
-    : base_node(output, thresh, fid, def_left, is_leaf), left_idx(left_index), dummy(0)
+  sparse_node16(val_t output,
+                val_t split,
+                int fid,
+                bool def_left,
+                bool is_leaf,
+                bool is_categorical,
+                int left_index)
+    : base_node(output, split, fid, def_left, is_leaf, is_categorical),
+      left_idx(left_index),
+      dummy(0)
   {
   }
   __host__ __device__ int left_index() const { return left_idx; }
@@ -140,28 +181,28 @@ struct alignas(16) sparse_node16 : base_node {
 
 /** sparse_node8 is a node of reduced size (8 bytes) in a sparse forest */
 struct alignas(8) sparse_node8 : base_node {
-  static const int FID_NUM_BITS    = 14;
-  static const int FID_MASK        = (1 << FID_NUM_BITS) - 1;
-  static const int LEFT_OFFSET     = FID_NUM_BITS;
-  static const int LEFT_NUM_BITS   = 16;
-  static const int LEFT_MASK       = ((1 << LEFT_NUM_BITS) - 1) << LEFT_OFFSET;
-  static const int DEF_LEFT_OFFSET = LEFT_OFFSET + LEFT_NUM_BITS;
-  static const int DEF_LEFT_MASK   = 1 << DEF_LEFT_OFFSET;
-  static const int IS_LEAF_OFFSET  = 31;
-  static const int IS_LEAF_MASK    = 1 << IS_LEAF_OFFSET;
+  static const int LEFT_NUM_BITS = 16;
+  static const int FID_NUM_BITS  = IS_CATEGORICAL_OFFSET - LEFT_NUM_BITS;
+  static const int LEFT_OFFSET   = FID_NUM_BITS;
+  static const int FID_MASK      = (1 << FID_NUM_BITS) - 1;
+  static const int LEFT_MASK     = ((1 << LEFT_NUM_BITS) - 1) << LEFT_OFFSET;
   __host__ __device__ int fid() const { return bits & FID_MASK; }
-  __host__ __device__ bool def_left() const { return bits & DEF_LEFT_MASK; }
-  __host__ __device__ bool is_leaf() const { return bits & IS_LEAF_MASK; }
   __host__ __device__ int left_index() const { return (bits & LEFT_MASK) >> LEFT_OFFSET; }
   sparse_node8() = default;
-  sparse_node8(val_t output, float thresh, int fid, bool def_left, bool is_leaf, int left_index)
+  sparse_node8(val_t output,
+               val_t split,
+               int fid,
+               bool def_left,
+               bool is_leaf,
+               bool is_categorical,
+               int left_index)
+    : base_node(output, split, fid, def_left, is_leaf, is_categorical)
   {
-    if (is_leaf)
-      val = output;
-    else
-      val.f = thresh;
-    bits = fid | left_index << LEFT_OFFSET | (def_left ? 1 : 0) << DEF_LEFT_OFFSET |
-           (is_leaf ? 1 : 0) << IS_LEAF_OFFSET;
+    RAFT_EXPECTS((fid & FID_MASK) == fid,
+                 "internal error: feature ID doesn't fit into sparse_node8");
+    RAFT_EXPECTS(((left_index << LEFT_OFFSET) & LEFT_MASK) == (left_index << LEFT_OFFSET),
+                 "internal error: left child index doesn't fit into sparse_node8");
+    bits |= left_index << LEFT_OFFSET;
   }
   /** index of the left child, where curr is the index of the current node */
   __host__ __device__ int left(int curr) const { return left_index(); }
@@ -171,6 +212,8 @@ struct alignas(8) sparse_node8 : base_node {
     and how FIL aggregates them into class margins/regression result/best class
 **/
 enum leaf_algo_t {
+  /** For iteration purposes */
+  MIN_LEAF_ALGO = 0,
   /** storing a class probability or regression summand. We add all margins
       together and determine regression result or use threshold to determine
       one of the two classes. **/
@@ -198,6 +241,7 @@ enum leaf_algo_t {
   /** Leaf contains an index into a vector of class probabilities. **/
   VECTOR_LEAF = 5,
   // to be extended
+  MAX_LEAF_ALGO = 5
 };
 
 template <leaf_algo_t leaf_algo>
@@ -259,13 +303,188 @@ struct forest_params_t {
   // at once inside a block (sharing trees means splitting input rows)
   int threads_per_tree;
   // n_items is how many input samples (items) any thread processes. If 0 is given,
-  // choose most (up to 4) that fit into shared memory.
+  // choose most (up to MAX_N_ITEMS) that fit into shared memory.
   int n_items;
 };
 
 /// FIL_TPB is the number of threads per block to use with FIL kernels
 const int FIL_TPB = 256;
 
+constexpr std::int32_t MAX_PRECISE_INT_FLOAT = 1 << 24;  // 16'777'216
+
+__host__ __device__ __forceinline__ int fetch_bit(const uint8_t* array, int bit)
+{
+  return (array[bit / BITS_PER_BYTE] >> (bit % BITS_PER_BYTE)) & 1;
+}
+
+struct categorical_sets {
+  // arrays are const to use fast GPU read instructions by default
+  // arrays from each node ID are concatenated first, then from all categories
+  const uint8_t* bits = nullptr;
+  // largest matching category in the model, per feature ID
+  const int* max_matching       = nullptr;
+  std::size_t bits_size         = 0;
+  std::size_t max_matching_size = 0;
+
+  __host__ __device__ __forceinline__ bool cats_present() const
+  {
+    // If this is constructed from cat_sets_owner, will return true
+    // default-initialized will return false
+    // Defining edge case: there are categorical nodes, but all have max_matching == -1
+    // (all categorical nodes are empty). node.thresh() would have returned 0.0f
+    // and the branch condition wouldn't have always been false (i.e branched left).
+    // Alternatively, we could have converted all empty categorical nodes to
+    // NAN-threshold numerical nodes.
+    return max_matching != nullptr;
+  }
+
+  // set count is due to tree_idx + node_within_tree_idx are both ints, hence uint32_t result
+  template <typename node_t>
+  __host__ __device__ __forceinline__ int category_matches(node_t node, int category) const
+  {
+    // standard boolean packing. This layout has better ILP
+    // node.set() is global across feature IDs and is an offset (as opposed
+    // to set number). If we run out of uint32_t and we have hundreds of
+    // features with similar categorical feature count, we may consider
+    // storing node ID within nodes with same feature ID and look up
+    // {.max_matching, .first_node_offset} = ...[feature_id]
+    return category <= max_matching[node.fid()] && fetch_bit(bits + node.set(), category);
+  }
+  static int sizeof_mask_from_max_matching(int max_matching)
+  {
+    return raft::ceildiv(max_matching + 1, BITS_PER_BYTE);
+  }
+  int sizeof_mask(int feature_id) const
+  {
+    return sizeof_mask_from_max_matching(max_matching[feature_id]);
+  }
+};
+
+// lets any tree determine a child index for a node in a generic fasion
+// used in fil_test.cu fot its child_index() in CPU predicting
+struct tree_base {
+  categorical_sets cat_sets;
+
+  template <bool CATS_SUPPORTED, typename node_t>
+  __host__ __device__ __forceinline__ int child_index(const node_t& node,
+                                                      int node_idx,
+                                                      float val) const
+  {
+    bool cond;
+
+    if (isnan(val)) {
+      cond = !node.def_left();
+    } else if (CATS_SUPPORTED && node.is_categorical()) {
+      cond = cat_sets.category_matches(node, static_cast<int>(val));
+    } else {
+      cond = val >= node.thresh();
+    }
+    return node.left(node_idx) + cond;
+  }
+};
+
+// -1 means no matching categories
+struct cat_feature_counters {
+  int max_matching = -1;
+  int n_nodes      = 0;
+  static cat_feature_counters combine(cat_feature_counters a, cat_feature_counters b)
+  {
+    return {.max_matching = std::max(a.max_matching, b.max_matching),
+            .n_nodes      = a.n_nodes + b.n_nodes};
+  }
+};
+
+// used only during model import. For inference, trimmed down using cat_sets_owner::accessor()
+// in internal.cuh, as opposed to fil_test.cu, because importing from treelite will require it
+struct cat_sets_owner {
+  // arrays from each node ID are concatenated first, then from all categories
+  std::vector<uint8_t> bits;
+  // largest matching category in the model, per feature ID. uses int because GPU code can only fit
+  // int
+  std::vector<int> max_matching;
+  // how many categorical nodes use a given feature id. Used for model shape string.
+  std::vector<std::size_t> n_nodes;
+  // per tree, size and offset of bit pool within the overall bit pool
+  std::vector<std::size_t> bit_pool_offsets;
+
+  categorical_sets accessor() const
+  {
+    return {
+      .bits              = bits.data(),
+      .max_matching      = max_matching.data(),
+      .bits_size         = bits.size(),
+      .max_matching_size = max_matching.size(),
+    };
+  }
+
+  void consume_counters(const std::vector<cat_feature_counters>& counters)
+  {
+    for (cat_feature_counters cf : counters) {
+      max_matching.push_back(cf.max_matching);
+      n_nodes.push_back(cf.n_nodes);
+    }
+  }
+
+  void consume_bit_pool_sizes(const std::vector<std::size_t>& bit_pool_sizes)
+  {
+    bit_pool_offsets.push_back(0);
+    for (std::size_t i = 0; i < bit_pool_sizes.size() - 1; ++i) {
+      bit_pool_offsets.push_back(bit_pool_offsets.back() + bit_pool_sizes[i]);
+    }
+    bits.resize(bit_pool_offsets.back() + bit_pool_sizes.back());
+  }
+
+  cat_sets_owner() {}
+  cat_sets_owner(std::vector<uint8_t> bits_, std::vector<int> max_matching_)
+    : bits(bits_), max_matching(max_matching_)
+  {
+  }
+};
+
+std::ostream& operator<<(std::ostream& os, const cat_sets_owner& cso);
+
+struct cat_sets_device_owner {
+  // arrays from each node ID are concatenated first, then from all categories
+  rmm::device_uvector<uint8_t> bits;
+  // largest matching category in the model, per feature ID
+  rmm::device_uvector<int> max_matching;
+
+  categorical_sets accessor() const
+  {
+    return {
+      .bits              = bits.data(),
+      .max_matching      = max_matching.data(),
+      .bits_size         = bits.size(),
+      .max_matching_size = max_matching.size(),
+    };
+  }
+  cat_sets_device_owner(cudaStream_t stream) : bits(0, stream), max_matching(0, stream) {}
+  cat_sets_device_owner(categorical_sets cat_sets, cudaStream_t stream)
+    : bits(cat_sets.bits_size, stream), max_matching(cat_sets.max_matching_size, stream)
+  {
+    ASSERT(bits.size() <= static_cast<std::size_t>(INT_MAX) + 1ull,
+           "too many categories/categorical nodes: cannot store bits offset in node");
+    if (cat_sets.max_matching_size > 0) {
+      ASSERT(cat_sets.max_matching != nullptr, "internal error: cat_sets.max_matching is nil");
+      CUDA_CHECK(cudaMemcpyAsync(max_matching.data(),
+                                 cat_sets.max_matching,
+                                 max_matching.size() * sizeof(int),
+                                 cudaMemcpyDefault,
+                                 stream));
+    }
+    if (cat_sets.bits_size > 0) {
+      ASSERT(cat_sets.bits != nullptr, "internal error: cat_sets.bits is nil");
+      CUDA_CHECK(cudaMemcpyAsync(
+        bits.data(), cat_sets.bits, bits.size() * sizeof(uint8_t), cudaMemcpyDefault, stream));
+    }
+  }
+  void release()
+  {
+    bits.release();
+    max_matching.release();
+  }
+};
+
 /** init_dense uses params and nodes to initialize the dense forest stored in pf
  *  @param h cuML handle used by this function
  *  @param pf pointer to where to store the newly created forest
@@ -276,9 +495,10 @@ const int FIL_TPB = 256;
  */
 void init_dense(const raft::handle_t& h,
                 forest_t* pf,
+                const categorical_sets& cat_sets,
+                const std::vector<float>& vector_leaf,
                 const dense_node* nodes,
-                const forest_params_t* params,
-                const std::vector<float>& vector_leaf);
+                const forest_params_t* params);
 
 /** init_sparse uses params, trees and nodes to initialize the sparse forest
  *  with sparse nodes stored in pf
@@ -294,10 +514,13 @@ void init_dense(const raft::handle_t& h,
 template <typename fil_node_t>
 void init_sparse(const raft::handle_t& h,
                  forest_t* pf,
+                 const categorical_sets& cat_sets,
+                 const std::vector<float>& vector_leaf,
                  const int* trees,
                  const fil_node_t* nodes,
-                 const forest_params_t* params,
-                 const std::vector<float>& vector_leaf);
+                 const forest_params_t* params);
 
 }  // namespace fil
+
+std::string output2str(fil::output_t output);
 }  // namespace ML
diff --git a/cpp/src/glm/ols.cuh b/cpp/src/glm/ols.cuh
index 065dc8624c..9e2e8212e0 100644
--- a/cpp/src/glm/ols.cuh
+++ b/cpp/src/glm/ols.cuh
@@ -91,15 +91,20 @@ void olsFit(const raft::handle_t& handle,
                    stream);
   }
 
-  if (algo == 0 || algo == 1) {
-    LinAlg::lstsq(handle, input, n_rows, n_cols, labels, coef, algo, stream);
-  } else if (algo == 2) {
-    LinAlg::lstsqQR(input, n_rows, n_cols, labels, coef, cusolver_handle, cublas_handle, stream);
-  } else if (algo == 3) {
-    ASSERT(false, "olsFit: no algorithm with this id has been implemented");
-  } else {
-    ASSERT(false, "olsFit: no algorithm with this id has been implemented");
+  int selectedAlgo = algo;
+  if (n_cols > n_rows || n_cols == 1) selectedAlgo = 0;
+
+  ML::PUSH_RANGE("Trace::MLCommon::LinAlg::ols-lstsq*", stream);
+  switch (selectedAlgo) {
+    case 0: LinAlg::lstsqSvdJacobi(handle, input, n_rows, n_cols, labels, coef, stream); break;
+    case 1: LinAlg::lstsqEig(handle, input, n_rows, n_cols, labels, coef, stream); break;
+    case 2: LinAlg::lstsqQR(handle, input, n_rows, n_cols, labels, coef, stream); break;
+    case 3: LinAlg::lstsqSvdQR(handle, input, n_rows, n_cols, labels, coef, stream); break;
+    default:
+      ASSERT(false, "olsFit: no algorithm with this id (%d) has been implemented", algo);
+      break;
   }
+  ML::POP_RANGE(stream);
 
   if (fit_intercept) {
     postProcessData(handle,
diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh
index 4397cb91a2..638addf2b1 100644
--- a/cpp/src/hdbscan/detail/reachability.cuh
+++ b/cpp/src/hdbscan/detail/reachability.cuh
@@ -34,7 +34,7 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cuml/neighbors/knn.hpp>
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 
 #include <thrust/transform.h>
 
diff --git a/cpp/src/kmeans/common.cuh b/cpp/src/kmeans/common.cuh
index 8c8df33c67..393c342280 100644
--- a/cpp/src/kmeans/common.cuh
+++ b/cpp/src/kmeans/common.cuh
@@ -30,7 +30,7 @@
 
 #include <raft/cudart_utils.h>
 #include <raft/comms/comms.hpp>
-#include <raft/distance/fused_l2_nn.cuh>
+#include <raft/distance/fused_l2_nn.hpp>
 #include <raft/linalg/binary_op.cuh>
 #include <raft/linalg/matrix_vector_op.cuh>
 #include <raft/linalg/mean_squared_error.cuh>
@@ -61,6 +61,7 @@
 #include <fstream>
 #include <numeric>
 #include <random>
+#include <vector>
 
 namespace ML {
 
diff --git a/cpp/src/knn/knn.cu b/cpp/src/knn/knn.cu
index cee3d34501..0b9fa1640d 100644
--- a/cpp/src/knn/knn.cu
+++ b/cpp/src/knn/knn.cu
@@ -18,6 +18,7 @@
 #include <raft/cuda_utils.cuh>
 #include <raft/label/classlabels.cuh>
 #include <raft/spatial/knn/ann.hpp>
+#include <raft/spatial/knn/ball_cover.hpp>
 #include <raft/spatial/knn/knn.hpp>
 #include <rmm/device_uvector.hpp>
 
@@ -64,6 +65,24 @@ void brute_force_knn(const raft::handle_t& handle,
                                       metric_arg);
 }
 
+void rbc_build_index(const raft::handle_t& handle,
+                     raft::spatial::knn::BallCoverIndex<int64_t, float, uint32_t>& index)
+{
+  raft::spatial::knn::rbc_build_index(handle, index);
+}
+
+void rbc_knn_query(const raft::handle_t& handle,
+                   raft::spatial::knn::BallCoverIndex<int64_t, float, uint32_t>& index,
+                   uint32_t k,
+                   const float* search_items,
+                   uint32_t n_search_items,
+                   int64_t* out_inds,
+                   float* out_dists)
+{
+  raft::spatial::knn::rbc_knn_query(
+    handle, index, k, search_items, n_search_items, out_inds, out_dists);
+}
+
 void approx_knn_build_index(raft::handle_t& handle,
                             raft::spatial::knn::knnIndex* index,
                             raft::spatial::knn::knnIndexParam* params,
diff --git a/cpp/src/metrics/pairwise_distance.cu b/cpp/src/metrics/pairwise_distance.cu
index 47af2985c4..1be8d14cc6 100644
--- a/cpp/src/metrics/pairwise_distance.cu
+++ b/cpp/src/metrics/pairwise_distance.cu
@@ -17,16 +17,21 @@
 
 #include <raft/sparse/distance/common.h>
 #include <cuml/metrics/metrics.hpp>
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/handle.hpp>
-#include <raft/sparse/distance/distance.cuh>
+#include <raft/sparse/distance/distance.hpp>
 #include "pairwise_distance_canberra.cuh"
 #include "pairwise_distance_chebyshev.cuh"
+#include "pairwise_distance_correlation.cuh"
 #include "pairwise_distance_cosine.cuh"
 #include "pairwise_distance_euclidean.cuh"
+#include "pairwise_distance_hamming.cuh"
 #include "pairwise_distance_hellinger.cuh"
+#include "pairwise_distance_jensen_shannon.cuh"
+#include "pairwise_distance_kl_divergence.cuh"
 #include "pairwise_distance_l1.cuh"
 #include "pairwise_distance_minkowski.cuh"
+#include "pairwise_distance_russell_rao.cuh"
 
 namespace ML {
 
@@ -50,22 +55,37 @@ void pairwise_distance(const raft::handle_t& handle,
       pairwise_distance_euclidean(handle, x, y, dist, m, n, k, metric, isRowMajor, metric_arg);
       break;
     case raft::distance::DistanceType::CosineExpanded:
-      pairwise_distance_cosine(handle, x, y, dist, m, n, k, metric, isRowMajor, metric_arg);
+      pairwise_distance_cosine(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
       break;
     case raft::distance::DistanceType::L1:
-      pairwise_distance_l1(handle, x, y, dist, m, n, k, metric, isRowMajor, metric_arg);
+      pairwise_distance_l1(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
       break;
     case raft::distance::DistanceType::Linf:
-      pairwise_distance_chebyshev(handle, x, y, dist, m, n, k, metric, isRowMajor, metric_arg);
+      pairwise_distance_chebyshev(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
       break;
     case raft::distance::DistanceType::HellingerExpanded:
-      pairwise_distance_hellinger(handle, x, y, dist, m, n, k, metric, isRowMajor, metric_arg);
+      pairwise_distance_hellinger(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
       break;
     case raft::distance::DistanceType::LpUnexpanded:
-      pairwise_distance_minkowski(handle, x, y, dist, m, n, k, metric, isRowMajor, metric_arg);
+      pairwise_distance_minkowski(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
       break;
     case raft::distance::DistanceType::Canberra:
-      pairwise_distance_canberra(handle, x, y, dist, m, n, k, metric, isRowMajor, metric_arg);
+      pairwise_distance_canberra(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
+      break;
+    case raft::distance::DistanceType::CorrelationExpanded:
+      pairwise_distance_correlation(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
+      break;
+    case raft::distance::DistanceType::HammingUnexpanded:
+      pairwise_distance_hamming(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
+      break;
+    case raft::distance::DistanceType::JensenShannon:
+      pairwise_distance_jensen_shannon(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
+      break;
+    case raft::distance::DistanceType::KLDivergence:
+      pairwise_distance_kl_divergence(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
+      break;
+    case raft::distance::DistanceType::RusselRaoExpanded:
+      pairwise_distance_russell_rao(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
       break;
     default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
   };
@@ -90,22 +110,37 @@ void pairwise_distance(const raft::handle_t& handle,
       pairwise_distance_euclidean(handle, x, y, dist, m, n, k, metric, isRowMajor, metric_arg);
       break;
     case raft::distance::DistanceType::CosineExpanded:
-      pairwise_distance_cosine(handle, x, y, dist, m, n, k, metric, isRowMajor, metric_arg);
+      pairwise_distance_cosine(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
       break;
     case raft::distance::DistanceType::L1:
-      pairwise_distance_l1(handle, x, y, dist, m, n, k, metric, isRowMajor, metric_arg);
+      pairwise_distance_l1(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
       break;
     case raft::distance::DistanceType::Linf:
-      pairwise_distance_chebyshev(handle, x, y, dist, m, n, k, metric, isRowMajor, metric_arg);
+      pairwise_distance_chebyshev(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
       break;
     case raft::distance::DistanceType::HellingerExpanded:
-      pairwise_distance_hellinger(handle, x, y, dist, m, n, k, metric, isRowMajor, metric_arg);
+      pairwise_distance_hellinger(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
       break;
     case raft::distance::DistanceType::LpUnexpanded:
-      pairwise_distance_minkowski(handle, x, y, dist, m, n, k, metric, isRowMajor, metric_arg);
+      pairwise_distance_minkowski(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
       break;
     case raft::distance::DistanceType::Canberra:
-      pairwise_distance_canberra(handle, x, y, dist, m, n, k, metric, isRowMajor, metric_arg);
+      pairwise_distance_canberra(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
+      break;
+    case raft::distance::DistanceType::CorrelationExpanded:
+      pairwise_distance_correlation(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
+      break;
+    case raft::distance::DistanceType::HammingUnexpanded:
+      pairwise_distance_hamming(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
+      break;
+    case raft::distance::DistanceType::JensenShannon:
+      pairwise_distance_jensen_shannon(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
+      break;
+    case raft::distance::DistanceType::KLDivergence:
+      pairwise_distance_kl_divergence(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
+      break;
+    case raft::distance::DistanceType::RusselRaoExpanded:
+      pairwise_distance_russell_rao(handle, x, y, dist, m, n, k, isRowMajor, metric_arg);
       break;
     default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
   };
diff --git a/cpp/src/metrics/pairwise_distance_canberra.cu b/cpp/src/metrics/pairwise_distance_canberra.cu
index fb0520c4bd..b2534d0c02 100644
--- a/cpp/src/metrics/pairwise_distance_canberra.cu
+++ b/cpp/src/metrics/pairwise_distance_canberra.cu
@@ -15,10 +15,10 @@
  * limitations under the License.
  */
 
-//#include <cuml/metrics/metrics.hpp>
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/handle.hpp>
 #include <rmm/device_uvector.hpp>
+#include "pairwise_distance_canberra.cuh"
 
 namespace ML {
 
@@ -30,25 +30,12 @@ void pairwise_distance_canberra(const raft::handle_t& handle,
                                 int m,
                                 int n,
                                 int k,
-                                raft::distance::DistanceType metric,
                                 bool isRowMajor,
                                 double metric_arg)
 {
-  // Allocate workspace
-  rmm::device_uvector<char> workspace(1, handle.get_stream());
-
   // Call the distance function
-  /*  raft::distance::pairwise_distance(x, y, dist, m, n, k, workspace, metric,
-                                    handle.get_stream(), isRowMajor,
-                                    metric_arg);*/
-
-  switch (metric) {
-    case raft::distance::DistanceType::Canberra:
-      raft::distance::pairwise_distance_impl<double, int, raft::distance::DistanceType::Canberra>(
-        x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
-      break;
-    default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
-  }
+  raft::distance::distance<raft::distance::DistanceType::Canberra, double, double, double, int>(
+    x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
 }
 
 void pairwise_distance_canberra(const raft::handle_t& handle,
@@ -58,25 +45,12 @@ void pairwise_distance_canberra(const raft::handle_t& handle,
                                 int m,
                                 int n,
                                 int k,
-                                raft::distance::DistanceType metric,
                                 bool isRowMajor,
                                 float metric_arg)
 {
-  // Allocate workspace
-  rmm::device_uvector<char> workspace(1, handle.get_stream());
-
   // Call the distance function
-  /*  raft::distance::pairwise_distance(x, y, dist, m, n, k, workspace, metric,
-                                    handle.get_stream(), isRowMajor,
-                                    metric_arg);*/
-
-  switch (metric) {
-    case raft::distance::DistanceType::Canberra:
-      raft::distance::pairwise_distance_impl<float, int, raft::distance::DistanceType::Canberra>(
-        x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
-      break;
-    default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
-  }
+  raft::distance::distance<raft::distance::DistanceType::Canberra, float, float, float, int>(
+    x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
 }
 
 }  // namespace Metrics
diff --git a/cpp/src/metrics/pairwise_distance_canberra.cuh b/cpp/src/metrics/pairwise_distance_canberra.cuh
index 3d1454cfcc..07e874ed0a 100644
--- a/cpp/src/metrics/pairwise_distance_canberra.cuh
+++ b/cpp/src/metrics/pairwise_distance_canberra.cuh
@@ -17,7 +17,7 @@
 
 #pragma once
 
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/handle.hpp>
 
 namespace ML {
@@ -30,7 +30,6 @@ void pairwise_distance_canberra(const raft::handle_t& handle,
                                 int m,
                                 int n,
                                 int k,
-                                raft::distance::DistanceType metric,
                                 bool isRowMajor,
                                 double metric_arg);
 
@@ -41,7 +40,6 @@ void pairwise_distance_canberra(const raft::handle_t& handle,
                                 int m,
                                 int n,
                                 int k,
-                                raft::distance::DistanceType metric,
                                 bool isRowMajor,
                                 float metric_arg);
 
diff --git a/cpp/src/metrics/pairwise_distance_chebyshev.cu b/cpp/src/metrics/pairwise_distance_chebyshev.cu
index d3bd683c89..27cc7aeeb5 100644
--- a/cpp/src/metrics/pairwise_distance_chebyshev.cu
+++ b/cpp/src/metrics/pairwise_distance_chebyshev.cu
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/handle.hpp>
 #include <rmm/device_uvector.hpp>
 #include "pairwise_distance_chebyshev.cuh"
@@ -29,20 +29,12 @@ void pairwise_distance_chebyshev(const raft::handle_t& handle,
                                  int m,
                                  int n,
                                  int k,
-                                 raft::distance::DistanceType metric,
                                  bool isRowMajor,
                                  double metric_arg)
 {
-  // Allocate workspace
-  rmm::device_uvector<char> workspace(1, handle.get_stream());
   // Call the distance function
-  switch (metric) {
-    case raft::distance::DistanceType::Linf:
-      raft::distance::pairwise_distance_impl<double, int, raft::distance::DistanceType::Linf>(
-        x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
-      break;
-    default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
-  }
+  raft::distance::distance<raft::distance::DistanceType::Linf, double, double, double, int>(
+    x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
 }
 
 void pairwise_distance_chebyshev(const raft::handle_t& handle,
@@ -52,20 +44,12 @@ void pairwise_distance_chebyshev(const raft::handle_t& handle,
                                  int m,
                                  int n,
                                  int k,
-                                 raft::distance::DistanceType metric,
                                  bool isRowMajor,
                                  float metric_arg)
 {
-  // Allocate workspace
-  rmm::device_uvector<char> workspace(1, handle.get_stream());
   // Call the distance function
-  switch (metric) {
-    case raft::distance::DistanceType::Linf:
-      raft::distance::pairwise_distance_impl<float, int, raft::distance::DistanceType::Linf>(
-        x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
-      break;
-    default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
-  }
+  raft::distance::distance<raft::distance::DistanceType::Linf, float, float, float, int>(
+    x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
 }
 
 }  // namespace Metrics
diff --git a/cpp/src/metrics/pairwise_distance_chebyshev.cuh b/cpp/src/metrics/pairwise_distance_chebyshev.cuh
index 6f95dbba30..6682479a47 100644
--- a/cpp/src/metrics/pairwise_distance_chebyshev.cuh
+++ b/cpp/src/metrics/pairwise_distance_chebyshev.cuh
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 #pragma once
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/handle.hpp>
 
 namespace ML {
@@ -28,7 +28,6 @@ void pairwise_distance_chebyshev(const raft::handle_t& handle,
                                  int m,
                                  int n,
                                  int k,
-                                 raft::distance::DistanceType metric,
                                  bool isRowMajor,
                                  double metric_arg);
 
@@ -39,7 +38,6 @@ void pairwise_distance_chebyshev(const raft::handle_t& handle,
                                  int m,
                                  int n,
                                  int k,
-                                 raft::distance::DistanceType metric,
                                  bool isRowMajor,
                                  float metric_arg);
 
diff --git a/cpp/src/metrics/pairwise_distance_correlation.cu b/cpp/src/metrics/pairwise_distance_correlation.cu
new file mode 100644
index 0000000000..1b5b654cb2
--- /dev/null
+++ b/cpp/src/metrics/pairwise_distance_correlation.cu
@@ -0,0 +1,59 @@
+
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/distance/distance.hpp>
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+#include "pairwise_distance_correlation.cuh"
+
+namespace ML {
+
+namespace Metrics {
+void pairwise_distance_correlation(const raft::handle_t& handle,
+                                   const double* x,
+                                   const double* y,
+                                   double* dist,
+                                   int m,
+                                   int n,
+                                   int k,
+                                   bool isRowMajor,
+                                   double metric_arg)
+{
+  // Call the distance function
+  raft::distance::
+    distance<raft::distance::DistanceType::CorrelationExpanded, double, double, double, int>(
+      x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
+}
+
+void pairwise_distance_correlation(const raft::handle_t& handle,
+                                   const float* x,
+                                   const float* y,
+                                   float* dist,
+                                   int m,
+                                   int n,
+                                   int k,
+                                   bool isRowMajor,
+                                   float metric_arg)
+{
+  // Call the distance function
+  raft::distance::
+    distance<raft::distance::DistanceType::CorrelationExpanded, float, float, float, int>(
+      x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
+}
+
+}  // namespace Metrics
+}  // namespace ML
diff --git a/cpp/src/metrics/pairwise_distance_correlation.cuh b/cpp/src/metrics/pairwise_distance_correlation.cuh
new file mode 100644
index 0000000000..a55a24528a
--- /dev/null
+++ b/cpp/src/metrics/pairwise_distance_correlation.cuh
@@ -0,0 +1,47 @@
+
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/distance/distance.hpp>
+#include <raft/handle.hpp>
+
+namespace ML {
+
+namespace Metrics {
+void pairwise_distance_correlation(const raft::handle_t& handle,
+                                   const double* x,
+                                   const double* y,
+                                   double* dist,
+                                   int m,
+                                   int n,
+                                   int k,
+                                   bool isRowMajor,
+                                   double metric_arg);
+
+void pairwise_distance_correlation(const raft::handle_t& handle,
+                                   const float* x,
+                                   const float* y,
+                                   float* dist,
+                                   int m,
+                                   int n,
+                                   int k,
+                                   bool isRowMajor,
+                                   float metric_arg);
+
+}  // namespace Metrics
+}  // namespace ML
diff --git a/cpp/src/metrics/pairwise_distance_cosine.cu b/cpp/src/metrics/pairwise_distance_cosine.cu
index 5d94fe7a26..7c9ef96d0f 100644
--- a/cpp/src/metrics/pairwise_distance_cosine.cu
+++ b/cpp/src/metrics/pairwise_distance_cosine.cu
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/handle.hpp>
 #include <rmm/device_uvector.hpp>
 #include "pairwise_distance_cosine.cuh"
@@ -30,22 +30,13 @@ void pairwise_distance_cosine(const raft::handle_t& handle,
                               int m,
                               int n,
                               int k,
-                              raft::distance::DistanceType metric,
                               bool isRowMajor,
                               double metric_arg)
 {
-  // Allocate workspace
-  rmm::device_uvector<char> workspace(1, handle.get_stream());
-
   // Call the distance function
-  switch (metric) {
-    case raft::distance::DistanceType::CosineExpanded:
-      raft::distance::
-        pairwise_distance_impl<double, int, raft::distance::DistanceType::CosineExpanded>(
-          x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
-      break;
-    default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
-  }
+  raft::distance::
+    distance<raft::distance::DistanceType::CosineExpanded, double, double, double, int>(
+      x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
 }
 
 void pairwise_distance_cosine(const raft::handle_t& handle,
@@ -55,20 +46,12 @@ void pairwise_distance_cosine(const raft::handle_t& handle,
                               int m,
                               int n,
                               int k,
-                              raft::distance::DistanceType metric,
                               bool isRowMajor,
                               float metric_arg)
 {
-  // Allocate workspace
-  rmm::device_uvector<char> workspace(1, handle.get_stream());
-  switch (metric) {
-    case raft::distance::DistanceType::CosineExpanded:
-      raft::distance::
-        pairwise_distance_impl<float, int, raft::distance::DistanceType::CosineExpanded>(
-          x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
-      break;
-    default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
-  }
+  // Call the distance function
+  raft::distance::distance<raft::distance::DistanceType::CosineExpanded, float, float, float, int>(
+    x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
 }
 
 }  // namespace Metrics
diff --git a/cpp/src/metrics/pairwise_distance_cosine.cuh b/cpp/src/metrics/pairwise_distance_cosine.cuh
index 04f07e7de7..714bb9157d 100644
--- a/cpp/src/metrics/pairwise_distance_cosine.cuh
+++ b/cpp/src/metrics/pairwise_distance_cosine.cuh
@@ -16,7 +16,7 @@
  */
 #pragma once
 
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/handle.hpp>
 
 namespace ML {
@@ -29,7 +29,6 @@ void pairwise_distance_cosine(const raft::handle_t& handle,
                               int m,
                               int n,
                               int k,
-                              raft::distance::DistanceType metric,
                               bool isRowMajor,
                               double metric_arg);
 
@@ -40,7 +39,6 @@ void pairwise_distance_cosine(const raft::handle_t& handle,
                               int m,
                               int n,
                               int k,
-                              raft::distance::DistanceType metric,
                               bool isRowMajor,
                               float metric_arg);
 
diff --git a/cpp/src/metrics/pairwise_distance_euclidean.cu b/cpp/src/metrics/pairwise_distance_euclidean.cu
index 6b06f8beac..55a27e9f17 100644
--- a/cpp/src/metrics/pairwise_distance_euclidean.cu
+++ b/cpp/src/metrics/pairwise_distance_euclidean.cu
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/handle.hpp>
 #include <rmm/device_uvector.hpp>
 #include "pairwise_distance_euclidean.cuh"
@@ -34,29 +34,27 @@ void pairwise_distance_euclidean(const raft::handle_t& handle,
                                  bool isRowMajor,
                                  double metric_arg)
 {
-  // Allocate workspace
-  rmm::device_uvector<char> workspace(1, handle.get_stream());
-
   // Call the distance function
   switch (metric) {
     case raft::distance::DistanceType::L2Expanded:
-      raft::distance::pairwise_distance_impl<double, int, raft::distance::DistanceType::L2Expanded>(
-        x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
+      raft::distance::
+        distance<raft::distance::DistanceType::L2Expanded, double, double, double, int>(
+          x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::L2SqrtExpanded:
       raft::distance::
-        pairwise_distance_impl<double, int, raft::distance::DistanceType::L2SqrtExpanded>(
-          x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
+        distance<raft::distance::DistanceType::L2SqrtExpanded, double, double, double, int>(
+          x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::L2Unexpanded:
       raft::distance::
-        pairwise_distance_impl<double, int, raft::distance::DistanceType::L2Unexpanded>(
-          x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
+        distance<raft::distance::DistanceType::L2Unexpanded, double, double, double, int>(
+          x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::L2SqrtUnexpanded:
       raft::distance::
-        pairwise_distance_impl<double, int, raft::distance::DistanceType::L2SqrtUnexpanded>(
-          x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
+        distance<raft::distance::DistanceType::L2SqrtUnexpanded, double, double, double, int>(
+          x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
       break;
     default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
   }
@@ -73,29 +71,26 @@ void pairwise_distance_euclidean(const raft::handle_t& handle,
                                  bool isRowMajor,
                                  float metric_arg)
 {
-  // Allocate workspace
-  rmm::device_uvector<char> workspace(1, handle.get_stream());
-
   // Call the distance function
   switch (metric) {
     case raft::distance::DistanceType::L2Expanded:
-      raft::distance::pairwise_distance_impl<float, int, raft::distance::DistanceType::L2Expanded>(
-        x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
+      raft::distance::distance<raft::distance::DistanceType::L2Expanded, float, float, float, int>(
+        x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::L2SqrtExpanded:
       raft::distance::
-        pairwise_distance_impl<float, int, raft::distance::DistanceType::L2SqrtExpanded>(
-          x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
+        distance<raft::distance::DistanceType::L2SqrtExpanded, float, float, float, int>(
+          x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::L2Unexpanded:
       raft::distance::
-        pairwise_distance_impl<float, int, raft::distance::DistanceType::L2Unexpanded>(
-          x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
+        distance<raft::distance::DistanceType::L2Unexpanded, float, float, float, int>(
+          x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::L2SqrtUnexpanded:
       raft::distance::
-        pairwise_distance_impl<float, int, raft::distance::DistanceType::L2SqrtUnexpanded>(
-          x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
+        distance<raft::distance::DistanceType::L2SqrtUnexpanded, float, float, float, int>(
+          x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
       break;
     default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
   }
diff --git a/cpp/src/metrics/pairwise_distance_euclidean.cuh b/cpp/src/metrics/pairwise_distance_euclidean.cuh
index c94b9a6515..509b88eb2c 100644
--- a/cpp/src/metrics/pairwise_distance_euclidean.cuh
+++ b/cpp/src/metrics/pairwise_distance_euclidean.cuh
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 #pragma once
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/handle.hpp>
 
 namespace ML {
diff --git a/cpp/src/metrics/pairwise_distance_hamming.cu b/cpp/src/metrics/pairwise_distance_hamming.cu
new file mode 100644
index 0000000000..2880319647
--- /dev/null
+++ b/cpp/src/metrics/pairwise_distance_hamming.cu
@@ -0,0 +1,59 @@
+
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/distance/distance.hpp>
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+#include "pairwise_distance_hamming.cuh"
+
+namespace ML {
+
+namespace Metrics {
+void pairwise_distance_hamming(const raft::handle_t& handle,
+                               const double* x,
+                               const double* y,
+                               double* dist,
+                               int m,
+                               int n,
+                               int k,
+                               bool isRowMajor,
+                               double metric_arg)
+{
+  // Call the distance function
+  raft::distance::
+    distance<raft::distance::DistanceType::HammingUnexpanded, double, double, double, int>(
+      x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
+}
+
+void pairwise_distance_hamming(const raft::handle_t& handle,
+                               const float* x,
+                               const float* y,
+                               float* dist,
+                               int m,
+                               int n,
+                               int k,
+                               bool isRowMajor,
+                               float metric_arg)
+{
+  // Call the distance function
+  raft::distance::
+    distance<raft::distance::DistanceType::HammingUnexpanded, float, float, float, int>(
+      x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
+}
+
+}  // namespace Metrics
+}  // namespace ML
diff --git a/cpp/src/metrics/pairwise_distance_hamming.cuh b/cpp/src/metrics/pairwise_distance_hamming.cuh
new file mode 100644
index 0000000000..6551c728e6
--- /dev/null
+++ b/cpp/src/metrics/pairwise_distance_hamming.cuh
@@ -0,0 +1,47 @@
+
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/distance/distance.hpp>
+#include <raft/handle.hpp>
+
+namespace ML {
+
+namespace Metrics {
+void pairwise_distance_hamming(const raft::handle_t& handle,
+                               const double* x,
+                               const double* y,
+                               double* dist,
+                               int m,
+                               int n,
+                               int k,
+                               bool isRowMajor,
+                               double metric_arg);
+
+void pairwise_distance_hamming(const raft::handle_t& handle,
+                               const float* x,
+                               const float* y,
+                               float* dist,
+                               int m,
+                               int n,
+                               int k,
+                               bool isRowMajor,
+                               float metric_arg);
+
+}  // namespace Metrics
+}  // namespace ML
diff --git a/cpp/src/metrics/pairwise_distance_hellinger.cu b/cpp/src/metrics/pairwise_distance_hellinger.cu
index 44c50e57c9..fcef6a2921 100644
--- a/cpp/src/metrics/pairwise_distance_hellinger.cu
+++ b/cpp/src/metrics/pairwise_distance_hellinger.cu
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/handle.hpp>
 #include <rmm/device_uvector.hpp>
 #include "pairwise_distance_hellinger.cuh"
@@ -30,21 +30,13 @@ void pairwise_distance_hellinger(const raft::handle_t& handle,
                                  int m,
                                  int n,
                                  int k,
-                                 raft::distance::DistanceType metric,
                                  bool isRowMajor,
                                  double metric_arg)
 {
-  // Allocate workspace
-  rmm::device_uvector<char> workspace(1, handle.get_stream());
   // Call the distance function
-  switch (metric) {
-    case raft::distance::DistanceType::HellingerExpanded:
-      raft::distance::
-        pairwise_distance_impl<double, int, raft::distance::DistanceType::HellingerExpanded>(
-          x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
-      break;
-    default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
-  }
+  raft::distance::
+    distance<raft::distance::DistanceType::HellingerExpanded, double, double, double, int>(
+      x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
 }
 
 void pairwise_distance_hellinger(const raft::handle_t& handle,
@@ -54,21 +46,12 @@ void pairwise_distance_hellinger(const raft::handle_t& handle,
                                  int m,
                                  int n,
                                  int k,
-                                 raft::distance::DistanceType metric,
                                  bool isRowMajor,
                                  float metric_arg)
 {
-  // Allocate workspace
-  rmm::device_uvector<char> workspace(1, handle.get_stream());
-  // Call the distance function
-  switch (metric) {
-    case raft::distance::DistanceType::HellingerExpanded:
-      raft::distance::
-        pairwise_distance_impl<float, int, raft::distance::DistanceType::HellingerExpanded>(
-          x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
-      break;
-    default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
-  }
+  raft::distance::
+    distance<raft::distance::DistanceType::HellingerExpanded, float, float, float, int>(
+      x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
 }
 
 }  // namespace Metrics
diff --git a/cpp/src/metrics/pairwise_distance_hellinger.cuh b/cpp/src/metrics/pairwise_distance_hellinger.cuh
index 70521b6578..560e413b53 100644
--- a/cpp/src/metrics/pairwise_distance_hellinger.cuh
+++ b/cpp/src/metrics/pairwise_distance_hellinger.cuh
@@ -16,7 +16,7 @@
  */
 #pragma once
 
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/handle.hpp>
 
 namespace ML {
@@ -29,7 +29,6 @@ void pairwise_distance_hellinger(const raft::handle_t& handle,
                                  int m,
                                  int n,
                                  int k,
-                                 raft::distance::DistanceType metric,
                                  bool isRowMajor,
                                  double metric_arg);
 
@@ -40,7 +39,6 @@ void pairwise_distance_hellinger(const raft::handle_t& handle,
                                  int m,
                                  int n,
                                  int k,
-                                 raft::distance::DistanceType metric,
                                  bool isRowMajor,
                                  float metric_arg);
 }  // namespace Metrics
diff --git a/cpp/src/metrics/pairwise_distance_jensen_shannon.cu b/cpp/src/metrics/pairwise_distance_jensen_shannon.cu
new file mode 100644
index 0000000000..7fe0d66c64
--- /dev/null
+++ b/cpp/src/metrics/pairwise_distance_jensen_shannon.cu
@@ -0,0 +1,56 @@
+
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/distance/distance.hpp>
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+#include "pairwise_distance_jensen_shannon.cuh"
+
+namespace ML {
+
+namespace Metrics {
+void pairwise_distance_jensen_shannon(const raft::handle_t& handle,
+                                      const double* x,
+                                      const double* y,
+                                      double* dist,
+                                      int m,
+                                      int n,
+                                      int k,
+                                      bool isRowMajor,
+                                      double metric_arg)
+{
+  raft::distance::
+    distance<raft::distance::DistanceType::JensenShannon, double, double, double, int>(
+      x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
+}
+
+void pairwise_distance_jensen_shannon(const raft::handle_t& handle,
+                                      const float* x,
+                                      const float* y,
+                                      float* dist,
+                                      int m,
+                                      int n,
+                                      int k,
+                                      bool isRowMajor,
+                                      float metric_arg)
+{
+  raft::distance::distance<raft::distance::DistanceType::JensenShannon, float, float, float, int>(
+    x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
+}
+
+}  // namespace Metrics
+}  // namespace ML
diff --git a/cpp/src/metrics/pairwise_distance_jensen_shannon.cuh b/cpp/src/metrics/pairwise_distance_jensen_shannon.cuh
new file mode 100644
index 0000000000..dab4b64657
--- /dev/null
+++ b/cpp/src/metrics/pairwise_distance_jensen_shannon.cuh
@@ -0,0 +1,47 @@
+
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/distance/distance.hpp>
+#include <raft/handle.hpp>
+
+namespace ML {
+
+namespace Metrics {
+void pairwise_distance_jensen_shannon(const raft::handle_t& handle,
+                                      const double* x,
+                                      const double* y,
+                                      double* dist,
+                                      int m,
+                                      int n,
+                                      int k,
+                                      bool isRowMajor,
+                                      double metric_arg);
+
+void pairwise_distance_jensen_shannon(const raft::handle_t& handle,
+                                      const float* x,
+                                      const float* y,
+                                      float* dist,
+                                      int m,
+                                      int n,
+                                      int k,
+                                      bool isRowMajor,
+                                      float metric_arg);
+
+}  // namespace Metrics
+}  // namespace ML
diff --git a/cpp/src/metrics/pairwise_distance_kl_divergence.cu b/cpp/src/metrics/pairwise_distance_kl_divergence.cu
new file mode 100644
index 0000000000..10c75ea7a2
--- /dev/null
+++ b/cpp/src/metrics/pairwise_distance_kl_divergence.cu
@@ -0,0 +1,55 @@
+
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/distance/distance.hpp>
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+#include "pairwise_distance_kl_divergence.cuh"
+
+namespace ML {
+
+namespace Metrics {
+void pairwise_distance_kl_divergence(const raft::handle_t& handle,
+                                     const double* x,
+                                     const double* y,
+                                     double* dist,
+                                     int m,
+                                     int n,
+                                     int k,
+                                     bool isRowMajor,
+                                     double metric_arg)
+{
+  raft::distance::distance<raft::distance::DistanceType::KLDivergence, double, double, double, int>(
+    x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
+}
+
+void pairwise_distance_kl_divergence(const raft::handle_t& handle,
+                                     const float* x,
+                                     const float* y,
+                                     float* dist,
+                                     int m,
+                                     int n,
+                                     int k,
+                                     bool isRowMajor,
+                                     float metric_arg)
+{
+  raft::distance::distance<raft::distance::DistanceType::KLDivergence, float, float, float, int>(
+    x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
+}
+
+}  // namespace Metrics
+}  // namespace ML
diff --git a/cpp/src/metrics/pairwise_distance_kl_divergence.cuh b/cpp/src/metrics/pairwise_distance_kl_divergence.cuh
new file mode 100644
index 0000000000..301ed1ba5f
--- /dev/null
+++ b/cpp/src/metrics/pairwise_distance_kl_divergence.cuh
@@ -0,0 +1,47 @@
+
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/distance/distance.hpp>
+#include <raft/handle.hpp>
+
+namespace ML {
+
+namespace Metrics {
+void pairwise_distance_kl_divergence(const raft::handle_t& handle,
+                                     const double* x,
+                                     const double* y,
+                                     double* dist,
+                                     int m,
+                                     int n,
+                                     int k,
+                                     bool isRowMajor,
+                                     double metric_arg);
+
+void pairwise_distance_kl_divergence(const raft::handle_t& handle,
+                                     const float* x,
+                                     const float* y,
+                                     float* dist,
+                                     int m,
+                                     int n,
+                                     int k,
+                                     bool isRowMajor,
+                                     float metric_arg);
+
+}  // namespace Metrics
+}  // namespace ML
diff --git a/cpp/src/metrics/pairwise_distance_l1.cu b/cpp/src/metrics/pairwise_distance_l1.cu
index 1863f582af..0a9d8f808d 100644
--- a/cpp/src/metrics/pairwise_distance_l1.cu
+++ b/cpp/src/metrics/pairwise_distance_l1.cu
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/handle.hpp>
 #include <rmm/device_uvector.hpp>
 #include "pairwise_distance_l1.cuh"
@@ -30,20 +30,11 @@ void pairwise_distance_l1(const raft::handle_t& handle,
                           int m,
                           int n,
                           int k,
-                          raft::distance::DistanceType metric,
                           bool isRowMajor,
                           double metric_arg)
 {
-  // Allocate workspace
-  rmm::device_uvector<char> workspace(1, handle.get_stream());
-  // Call the distance function
-  switch (metric) {
-    case raft::distance::DistanceType::L1:
-      raft::distance::pairwise_distance_impl<double, int, raft::distance::DistanceType::L1>(
-        x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
-      break;
-    default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
-  }
+  raft::distance::distance<raft::distance::DistanceType::L1, double, double, double, int>(
+    x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
 }
 
 void pairwise_distance_l1(const raft::handle_t& handle,
@@ -53,20 +44,11 @@ void pairwise_distance_l1(const raft::handle_t& handle,
                           int m,
                           int n,
                           int k,
-                          raft::distance::DistanceType metric,
                           bool isRowMajor,
                           float metric_arg)
 {
-  // Allocate workspace
-  rmm::device_uvector<char> workspace(1, handle.get_stream());
-  // Call the distance function
-  switch (metric) {
-    case raft::distance::DistanceType::L1:
-      raft::distance::pairwise_distance_impl<float, int, raft::distance::DistanceType::L1>(
-        x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
-      break;
-    default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
-  }
+  raft::distance::distance<raft::distance::DistanceType::L1, float, float, float, int>(
+    x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
 }
 
 }  // namespace Metrics
diff --git a/cpp/src/metrics/pairwise_distance_l1.cuh b/cpp/src/metrics/pairwise_distance_l1.cuh
index f451df5cc8..0d63a2bec7 100644
--- a/cpp/src/metrics/pairwise_distance_l1.cuh
+++ b/cpp/src/metrics/pairwise_distance_l1.cuh
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 #pragma once
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/handle.hpp>
 
 namespace ML {
@@ -28,7 +28,6 @@ void pairwise_distance_l1(const raft::handle_t& handle,
                           int m,
                           int n,
                           int k,
-                          raft::distance::DistanceType metric,
                           bool isRowMajor,
                           double metric_arg);
 
@@ -39,7 +38,6 @@ void pairwise_distance_l1(const raft::handle_t& handle,
                           int m,
                           int n,
                           int k,
-                          raft::distance::DistanceType metric,
                           bool isRowMajor,
                           float metric_arg);
 
diff --git a/cpp/src/metrics/pairwise_distance_minkowski.cu b/cpp/src/metrics/pairwise_distance_minkowski.cu
index 6772edeff2..11e1367d6e 100644
--- a/cpp/src/metrics/pairwise_distance_minkowski.cu
+++ b/cpp/src/metrics/pairwise_distance_minkowski.cu
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/handle.hpp>
 #include <rmm/device_uvector.hpp>
 #include "pairwise_distance_minkowski.cuh"
@@ -30,21 +30,11 @@ void pairwise_distance_minkowski(const raft::handle_t& handle,
                                  int m,
                                  int n,
                                  int k,
-                                 raft::distance::DistanceType metric,
                                  bool isRowMajor,
                                  double metric_arg)
 {
-  // Allocate workspace
-  rmm::device_uvector<char> workspace(1, handle.get_stream());
-  // Call the distance function
-  switch (metric) {
-    case raft::distance::DistanceType::LpUnexpanded:
-      raft::distance::
-        pairwise_distance_impl<double, int, raft::distance::DistanceType::LpUnexpanded>(
-          x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor, metric_arg);
-      break;
-    default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
-  }
+  raft::distance::distance<raft::distance::DistanceType::LpUnexpanded, double, double, double, int>(
+    x, y, dist, m, n, k, handle.get_stream(), isRowMajor, metric_arg);
 }
 
 void pairwise_distance_minkowski(const raft::handle_t& handle,
@@ -54,21 +44,11 @@ void pairwise_distance_minkowski(const raft::handle_t& handle,
                                  int m,
                                  int n,
                                  int k,
-                                 raft::distance::DistanceType metric,
                                  bool isRowMajor,
                                  float metric_arg)
 {
-  // Allocate workspace
-  rmm::device_uvector<char> workspace(1, handle.get_stream());
-  // Call the distance function
-  switch (metric) {
-    case raft::distance::DistanceType::LpUnexpanded:
-      raft::distance::
-        pairwise_distance_impl<float, int, raft::distance::DistanceType::LpUnexpanded>(
-          x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor, metric_arg);
-      break;
-    default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
-  }
+  raft::distance::distance<raft::distance::DistanceType::LpUnexpanded, float, float, float, int>(
+    x, y, dist, m, n, k, handle.get_stream(), isRowMajor, metric_arg);
 }
 
 }  // namespace Metrics
diff --git a/cpp/src/metrics/pairwise_distance_minkowski.cuh b/cpp/src/metrics/pairwise_distance_minkowski.cuh
index 013205e67b..b1a2824254 100644
--- a/cpp/src/metrics/pairwise_distance_minkowski.cuh
+++ b/cpp/src/metrics/pairwise_distance_minkowski.cuh
@@ -16,7 +16,7 @@
  */
 
 #pragma once
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/handle.hpp>
 
 namespace ML {
@@ -29,7 +29,6 @@ void pairwise_distance_minkowski(const raft::handle_t& handle,
                                  int m,
                                  int n,
                                  int k,
-                                 raft::distance::DistanceType metric,
                                  bool isRowMajor,
                                  double metric_arg);
 
@@ -40,7 +39,6 @@ void pairwise_distance_minkowski(const raft::handle_t& handle,
                                  int m,
                                  int n,
                                  int k,
-                                 raft::distance::DistanceType metric,
                                  bool isRowMajor,
                                  float metric_arg);
 
diff --git a/cpp/src/metrics/pairwise_distance_russell_rao.cu b/cpp/src/metrics/pairwise_distance_russell_rao.cu
new file mode 100644
index 0000000000..b5a8fd4981
--- /dev/null
+++ b/cpp/src/metrics/pairwise_distance_russell_rao.cu
@@ -0,0 +1,57 @@
+
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/distance/distance.hpp>
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+#include "pairwise_distance_russell_rao.cuh"
+
+namespace ML {
+
+namespace Metrics {
+void pairwise_distance_russell_rao(const raft::handle_t& handle,
+                                   const double* x,
+                                   const double* y,
+                                   double* dist,
+                                   int m,
+                                   int n,
+                                   int k,
+                                   bool isRowMajor,
+                                   double metric_arg)
+{
+  raft::distance::
+    distance<raft::distance::DistanceType::RusselRaoExpanded, double, double, double, int>(
+      x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
+}
+
+void pairwise_distance_russell_rao(const raft::handle_t& handle,
+                                   const float* x,
+                                   const float* y,
+                                   float* dist,
+                                   int m,
+                                   int n,
+                                   int k,
+                                   bool isRowMajor,
+                                   float metric_arg)
+{
+  raft::distance::
+    distance<raft::distance::DistanceType::RusselRaoExpanded, float, float, float, int>(
+      x, y, dist, m, n, k, handle.get_stream(), isRowMajor);
+}
+
+}  // namespace Metrics
+}  // namespace ML
diff --git a/cpp/src/metrics/pairwise_distance_russell_rao.cuh b/cpp/src/metrics/pairwise_distance_russell_rao.cuh
new file mode 100644
index 0000000000..90f2c64ab8
--- /dev/null
+++ b/cpp/src/metrics/pairwise_distance_russell_rao.cuh
@@ -0,0 +1,47 @@
+
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/distance/distance.hpp>
+#include <raft/handle.hpp>
+
+namespace ML {
+
+namespace Metrics {
+void pairwise_distance_russell_rao(const raft::handle_t& handle,
+                                   const double* x,
+                                   const double* y,
+                                   double* dist,
+                                   int m,
+                                   int n,
+                                   int k,
+                                   bool isRowMajor,
+                                   double metric_arg);
+
+void pairwise_distance_russell_rao(const raft::handle_t& handle,
+                                   const float* x,
+                                   const float* y,
+                                   float* dist,
+                                   int m,
+                                   int n,
+                                   int k,
+                                   bool isRowMajor,
+                                   float metric_arg);
+
+}  // namespace Metrics
+}  // namespace ML
diff --git a/cpp/src/metrics/trustworthiness.cu b/cpp/src/metrics/trustworthiness.cu
index e7ebeb92a4..6eed7759a5 100644
--- a/cpp/src/metrics/trustworthiness.cu
+++ b/cpp/src/metrics/trustworthiness.cu
@@ -18,7 +18,7 @@
 
 #include <cuml/metrics/metrics.hpp>
 
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/handle.hpp>
 
 namespace ML {
diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu
index 11b8d748c5..e5d3c44ac9 100644
--- a/cpp/src/randomforest/randomforest.cu
+++ b/cpp/src/randomforest/randomforest.cu
@@ -31,6 +31,7 @@
 #include <fstream>
 #include <iostream>
 #include <string>
+#include <type_traits>
 #include <vector>
 
 namespace ML {
@@ -242,26 +243,29 @@ std::string get_rf_json(const RandomForestMetaData<T, L>* forest)
 template <class T, class L>
 void build_treelite_forest(ModelHandle* model_handle,
                            const RandomForestMetaData<T, L>* forest,
-                           int num_features,
-                           int task_category)
+                           int num_features)
 {
   auto parent_model          = tl::Model::Create<T, T>();
   tl::ModelImpl<T, T>* model = dynamic_cast<tl::ModelImpl<T, T>*>(parent_model.get());
   ASSERT(model != nullptr, "Invalid downcast to tl::ModelImpl");
 
-  unsigned int num_class;
-  if (task_category > 2) {
-    // Multi-class classification
-    num_class        = task_category;
+  // Determine number of outputs
+  int num_outputs = forest->trees.front()->num_outputs;
+  ASSERT(num_outputs > 0, "Invalid forest");
+  for (const auto& tree : forest->trees) {
+    ASSERT(num_outputs == tree->num_outputs, "Invalid forest");
+  }
+
+  if constexpr (std::is_integral_v<L>) {
+    ASSERT(num_outputs > 1, "More than one variable expected for classification problem.");
     model->task_type = tl::TaskType::kMultiClfProbDistLeaf;
-    std::strcpy(model->param.pred_transform, "max_index");
+    std::strncpy(model->param.pred_transform, "max_index", sizeof(model->param.pred_transform));
   } else {
-    // Binary classification or regression
-    num_class        = 1;
     model->task_type = tl::TaskType::kBinaryClfRegr;
   }
 
-  model->task_param = tl::TaskParam{tl::TaskParam::OutputType::kFloat, false, num_class, num_class};
+  model->task_param = tl::TaskParam{
+    tl::TaskParam::OutputType::kFloat, false, (unsigned int)num_outputs, (unsigned int)num_outputs};
   model->num_feature         = num_features;
   model->average_tree_output = true;
   model->SetTreeLimit(forest->rf_params.n_trees);
@@ -271,7 +275,7 @@ void build_treelite_forest(ModelHandle* model_handle,
     auto rf_tree = forest->trees[i];
 
     if (rf_tree->sparsetree.size() != 0) {
-      model->trees[i] = DT::build_treelite_tree<T, L>(*rf_tree, num_class);
+      model->trees[i] = DT::build_treelite_tree<T, L>(*rf_tree, num_outputs);
     }
   }
 
@@ -771,19 +775,13 @@ template void delete_rf_metadata<double, double>(RandomForestRegressorD* forest)
 
 template void build_treelite_forest<float, int>(ModelHandle* model,
                                                 const RandomForestMetaData<float, int>* forest,
-                                                int num_features,
-                                                int task_category);
+                                                int num_features);
 template void build_treelite_forest<double, int>(ModelHandle* model,
                                                  const RandomForestMetaData<double, int>* forest,
-                                                 int num_features,
-                                                 int task_category);
+                                                 int num_features);
 template void build_treelite_forest<float, float>(ModelHandle* model,
                                                   const RandomForestMetaData<float, float>* forest,
-                                                  int num_features,
-                                                  int task_category);
+                                                  int num_features);
 template void build_treelite_forest<double, double>(
-  ModelHandle* model,
-  const RandomForestMetaData<double, double>* forest,
-  int num_features,
-  int task_category);
+  ModelHandle* model, const RandomForestMetaData<double, double>* forest, int num_features);
 }  // End namespace ML
diff --git a/cpp/src/randomforest/randomforest.cuh b/cpp/src/randomforest/randomforest.cuh
index 159e2d2164..c15331080c 100644
--- a/cpp/src/randomforest/randomforest.cuh
+++ b/cpp/src/randomforest/randomforest.cuh
@@ -162,9 +162,9 @@ class RandomForest {
 #pragma omp parallel for num_threads(n_streams)
     for (int i = 0; i < this->rf_params.n_trees; i++) {
       int stream_id = omp_get_thread_num();
+      auto s        = handle.get_internal_stream(stream_id);
 
-      this->get_row_sample(
-        i, n_rows, &selected_rows[stream_id], handle.get_internal_stream(stream_id));
+      this->get_row_sample(i, n_rows, &selected_rows[stream_id], s);
 
       /* Build individual tree in the forest.
         - input is a pointer to orig data that have n_cols features and n_rows rows.
@@ -176,6 +176,7 @@ class RandomForest {
       */
 
       forest->trees[i] = DT::DecisionTree::fit(handle,
+                                               s,
                                                input,
                                                n_cols,
                                                n_rows,
@@ -226,55 +227,33 @@ class RandomForest {
 
     ML::PatternSetter _("%v");
     for (int row_id = 0; row_id < n_rows; row_id++) {
-      if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG)) {
-        std::stringstream ss;
-        ss << "Predict for sample: ";
-        for (int i = 0; i < n_cols; i++)
-          ss << h_input[row_id * row_size + i] << ", ";
-        CUML_LOG_DEBUG(ss.str().c_str());
+      std::vector<T> row_prediction(forest->trees[0]->num_outputs);
+      for (int i = 0; i < this->rf_params.n_trees; i++) {
+        DT::DecisionTree::predict(user_handle,
+                                  *forest->trees[i],
+                                  &h_input[row_id * row_size],
+                                  1,
+                                  n_cols,
+                                  row_prediction.data(),
+                                  forest->trees[i]->num_outputs,
+                                  verbosity);
+      }
+      for (int k = 0; k < forest->trees[0]->num_outputs; k++) {
+        row_prediction[k] /= this->rf_params.n_trees;
       }
-
       if (rf_type == RF_type::CLASSIFICATION) {  // classification task: use 'majority' prediction
-        std::map<int, int> prediction_to_cnt;
-        std::pair<std::map<int, int>::iterator, bool> ret;
-        int max_cnt_so_far      = 0;
-        int majority_prediction = -1;
-
-        for (int i = 0; i < this->rf_params.n_trees; i++) {
-          L prediction;
-          DT::DecisionTree::predict(user_handle,
-                                    forest->trees[i].get(),
-                                    &h_input[row_id * row_size],
-                                    1,
-                                    n_cols,
-                                    &prediction,
-                                    verbosity);
-          ret = prediction_to_cnt.insert(std::pair<int, int>(prediction, 1));
-          if (!(ret.second)) { ret.first->second += 1; }
-          // Break ties with smaller label
-          if (max_cnt_so_far < ret.first->second ||
-              (max_cnt_so_far == ret.first->second && ret.first->first < majority_prediction)) {
-            max_cnt_so_far      = ret.first->second;
-            majority_prediction = ret.first->first;
+        L best_class = 0;
+        T best_prob  = 0.0;
+        for (int k = 0; k < forest->trees[0]->num_outputs; k++) {
+          if (row_prediction[k] > best_prob) {
+            best_class = k;
+            best_prob  = row_prediction[k];
           }
         }
 
-        h_predictions[row_id] = majority_prediction;
-      } else {  // regression task: use 'average' prediction
-        L sum_predictions = 0;
-        for (int i = 0; i < this->rf_params.n_trees; i++) {
-          L prediction;
-          DT::DecisionTree::predict(user_handle,
-                                    forest->trees[i].get(),
-                                    &h_input[row_id * row_size],
-                                    1,
-                                    n_cols,
-                                    &prediction,
-                                    verbosity);
-          sum_predictions += prediction;
-        }
-        // Random forest's prediction is the arithmetic mean of all its decision tree predictions.
-        h_predictions[row_id] = sum_predictions / this->rf_params.n_trees;
+        h_predictions[row_id] = best_class;
+      } else {
+        h_predictions[row_id] = row_prediction[0];
       }
     }
 
diff --git a/cpp/src/solver/lars_impl.cuh b/cpp/src/solver/lars_impl.cuh
index 3e8c5d2440..9ab45735fd 100644
--- a/cpp/src/solver/lars_impl.cuh
+++ b/cpp/src/solver/lars_impl.cuh
@@ -19,6 +19,7 @@
 #include <iostream>
 #include <limits>
 #include <numeric>
+#include <vector>
 
 #include <raft/cudart_utils.h>
 #include <raft/linalg/cublas_wrappers.h>
diff --git a/cpp/src/umap/runner.cuh b/cpp/src/umap/runner.cuh
index 87fa480ca0..1b1e1f4803 100644
--- a/cpp/src/umap/runner.cuh
+++ b/cpp/src/umap/runner.cuh
@@ -163,6 +163,172 @@ void _fit(const raft::handle_t& handle,
   ML::POP_RANGE();
 }
 
+template <typename value_idx, typename value_t, typename umap_inputs, int TPB_X>
+void _get_graph(const raft::handle_t& handle,
+                const umap_inputs& inputs,
+                UMAPParams* params,
+                raft::sparse::COO<value_t>* cgraph_coo  // assumes single-precision int as the
+                                                        // second template argument for COO
+)
+{
+  ML::PUSH_RANGE("umap::supervised::_get_graph");
+  cudaStream_t stream = handle.get_stream();
+
+  int k = params->n_neighbors;
+
+  ML::Logger::get().setLevel(params->verbosity);
+
+  CUML_LOG_DEBUG("n_neighbors=%d", params->n_neighbors);
+
+  ML::PUSH_RANGE("umap::knnGraph");
+  std::unique_ptr<rmm::device_uvector<value_idx>> knn_indices_b = nullptr;
+  std::unique_ptr<rmm::device_uvector<value_t>> knn_dists_b     = nullptr;
+
+  knn_graph<value_idx, value_t> knn_graph(inputs.n, k);
+
+  /**
+   * If not given precomputed knn graph, compute it
+   */
+  if (inputs.alloc_knn_graph()) {
+    /**
+     * Allocate workspace for kNN graph
+     */
+    knn_indices_b = std::make_unique<rmm::device_uvector<value_idx>>(inputs.n * k, stream);
+    knn_dists_b   = std::make_unique<rmm::device_uvector<value_t>>(inputs.n * k, stream);
+
+    knn_graph.knn_indices = knn_indices_b->data();
+    knn_graph.knn_dists   = knn_dists_b->data();
+  }
+
+  CUML_LOG_DEBUG("Calling knn graph run");
+
+  kNNGraph::run<value_idx, value_t, umap_inputs>(
+    handle, inputs, inputs, knn_graph, k, params, stream);
+  ML::POP_RANGE();
+
+  CUML_LOG_DEBUG("Done. Calling fuzzy simplicial set");
+
+  ML::PUSH_RANGE("umap::simplicial_set");
+  raft::sparse::COO<value_t> rgraph_coo(stream);
+  FuzzySimplSet::run<TPB_X, value_idx, value_t>(
+    inputs.n, knn_graph.knn_indices, knn_graph.knn_dists, k, &rgraph_coo, params, stream);
+
+  CUML_LOG_DEBUG("Done. Calling remove zeros");
+
+  /**
+   * Remove zeros from simplicial set
+   */
+  raft::sparse::op::coo_remove_zeros<TPB_X, value_t>(&rgraph_coo, cgraph_coo, stream);
+  ML::POP_RANGE();
+}
+
+template <typename value_idx, typename value_t, typename umap_inputs, int TPB_X>
+void _get_graph_supervised(
+  const raft::handle_t& handle,
+  const umap_inputs& inputs,
+  UMAPParams* params,
+  raft::sparse::COO<value_t>* cgraph_coo  // assumes single-precision int as the
+                                          // second template argument for COO
+)
+{
+  ML::PUSH_RANGE("umap::supervised::_get_graph_supervised");
+  cudaStream_t stream = handle.get_stream();
+
+  int k = params->n_neighbors;
+
+  ML::Logger::get().setLevel(params->verbosity);
+
+  if (params->target_n_neighbors == -1) params->target_n_neighbors = params->n_neighbors;
+
+  ML::PUSH_RANGE("umap::knnGraph");
+  std::unique_ptr<rmm::device_uvector<value_idx>> knn_indices_b = nullptr;
+  std::unique_ptr<rmm::device_uvector<value_t>> knn_dists_b     = nullptr;
+
+  knn_graph<value_idx, value_t> knn_graph(inputs.n, k);
+
+  /**
+   * If not given precomputed knn graph, compute it
+   */
+  if (inputs.alloc_knn_graph()) {
+    /**
+     * Allocate workspace for kNN graph
+     */
+    knn_indices_b = std::make_unique<rmm::device_uvector<value_idx>>(inputs.n * k, stream);
+    knn_dists_b   = std::make_unique<rmm::device_uvector<value_t>>(inputs.n * k, stream);
+
+    knn_graph.knn_indices = knn_indices_b->data();
+    knn_graph.knn_dists   = knn_dists_b->data();
+  }
+
+  kNNGraph::run<value_idx, value_t, umap_inputs>(
+    handle, inputs, inputs, knn_graph, k, params, stream);
+
+  ML::POP_RANGE();
+
+  /**
+   * Allocate workspace for fuzzy simplicial set.
+   */
+  ML::PUSH_RANGE("umap::simplicial_set");
+  raft::sparse::COO<value_t> rgraph_coo(stream);
+  raft::sparse::COO<value_t> tmp_coo(stream);
+
+  /**
+   * Run Fuzzy simplicial set
+   */
+  // int nnz = n*k*2;
+  FuzzySimplSet::run<TPB_X, value_idx, value_t>(inputs.n,
+                                                knn_graph.knn_indices,
+                                                knn_graph.knn_dists,
+                                                params->n_neighbors,
+                                                &tmp_coo,
+                                                params,
+                                                stream);
+  CUDA_CHECK(cudaPeekAtLastError());
+
+  raft::sparse::op::coo_remove_zeros<TPB_X, value_t>(&tmp_coo, &rgraph_coo, stream);
+
+  /**
+   * If target metric is 'categorical', perform
+   * categorical simplicial set intersection.
+   */
+  if (params->target_metric == ML::UMAPParams::MetricType::CATEGORICAL) {
+    CUML_LOG_DEBUG("Performing categorical intersection");
+    Supervised::perform_categorical_intersection<TPB_X, value_t>(
+      inputs.y, &rgraph_coo, cgraph_coo, params, stream);
+
+    /**
+     * Otherwise, perform general simplicial set intersection
+     */
+  } else {
+    CUML_LOG_DEBUG("Performing general intersection");
+    Supervised::perform_general_intersection<TPB_X, value_idx, value_t>(
+      handle, inputs.y, &rgraph_coo, cgraph_coo, params, stream);
+  }
+
+  /**
+   * Remove zeros
+   */
+  raft::sparse::op::coo_sort<value_t>(cgraph_coo, stream);
+
+  raft::sparse::COO<value_t> ocoo(stream);
+  raft::sparse::op::coo_remove_zeros<TPB_X, value_t>(cgraph_coo, &ocoo, stream);
+  ML::POP_RANGE();
+}
+
+template <typename value_idx, typename value_t, typename umap_inputs, int TPB_X>
+void _refine(const raft::handle_t& handle,
+             const umap_inputs& inputs,
+             UMAPParams* params,
+             raft::sparse::COO<value_t>* cgraph_coo,
+             value_t* embeddings)
+{
+  cudaStream_t stream = handle.get_stream();
+  /**
+   * Run simplicial set embedding to approximate low-dimensional representation
+   */
+  SimplSetEmbed::run<TPB_X, value_t>(inputs.n, inputs.d, cgraph_coo, params, embeddings, stream);
+}
+
 template <typename value_idx, typename value_t, typename umap_inputs, int TPB_X>
 void _fit_supervised(const raft::handle_t& handle,
                      const umap_inputs& inputs,
diff --git a/cpp/src/umap/umap.cu b/cpp/src/umap/umap.cu
index dc31d9af37..bf23e786cb 100644
--- a/cpp/src/umap/umap.cu
+++ b/cpp/src/umap/umap.cu
@@ -136,6 +136,43 @@ void fit(const raft::handle_t& handle,
   }
 }
 
+// get graph
+std::unique_ptr<raft::sparse::COO<float, int>> get_graph(const raft::handle_t& handle,
+                                                         float* X,  // input matrix
+                                                         float* y,  // labels
+                                                         int n,
+                                                         int d,
+                                                         UMAPParams* params)
+{
+  manifold_dense_inputs_t<float> inputs(X, y, n, d);
+  auto cgraph_coo = std::make_unique<raft::sparse::COO<float>>(handle.get_stream());
+  if (y != nullptr) {
+    UMAPAlgo::
+      _get_graph_supervised<knn_indices_dense_t, float, manifold_dense_inputs_t<float>, TPB_X>(
+        handle, inputs, params, cgraph_coo.get());
+  } else {
+    UMAPAlgo::_get_graph<knn_indices_dense_t, float, manifold_dense_inputs_t<float>, TPB_X>(
+      handle, inputs, params, cgraph_coo.get());
+  }
+
+  return cgraph_coo;
+}
+
+// refine
+void refine(const raft::handle_t& handle,
+            float* X,  // input matrix
+            int n,
+            int d,
+            raft::sparse::COO<float>* cgraph_coo,
+            UMAPParams* params,
+            float* embeddings)
+{
+  CUML_LOG_DEBUG("Calling UMAP::refine() with precomputed KNN");
+  manifold_dense_inputs_t<float> inputs(X, nullptr, n, d);
+  UMAPAlgo::_refine<knn_indices_dense_t, float, manifold_dense_inputs_t<float>, TPB_X>(
+    handle, inputs, params, cgraph_coo, embeddings);
+}
+
 // Sparse fit
 void fit_sparse(const raft::handle_t& handle,
                 int* indptr,  // input matrix
diff --git a/cpp/src_prims/linalg/block.cuh b/cpp/src_prims/linalg/block.cuh
index 81fc9ac34a..cd8502cebf 100644
--- a/cpp/src_prims/linalg/block.cuh
+++ b/cpp/src_prims/linalg/block.cuh
@@ -70,19 +70,16 @@ namespace MLCommon {
 namespace LinAlg {
 
 /**
- * Execution policy for a block-local GEMM
+ * Generic block policy, that can be inherited by more specific policies.
+ * Describes the shape of a tile worked by a thread block.
  *
- * @tparam _veclen Length for vectorized loads (1 or 2 for fp64 + 4 for fp32)
- * @tparam _kblk   Tile dimension k
  * @tparam _rpt    Rows worked per thread
  * @tparam _cpt    Columns worked per thread
  * @tparam _tr     Number of thread rows
  * @tparam _tc     Number of thread columns
  */
-template <int _veclen, int _kblk, int _rpt, int _cpt, int _tr, int _tc>
-struct BlockGemmPolicy {
-  /** Length for vectorized loads */
-  static constexpr int VecLen = _veclen;
+template <int _rpt, int _cpt, int _tr, int _tc>
+struct BlockPolicy {
   /** Rows worked per thread */
   static constexpr int RowsPerTh = _rpt;
   /** Columns worked per thread */
@@ -93,46 +90,66 @@ struct BlockGemmPolicy {
   static constexpr int ThRows = _tr;
   /** Number of thread columns */
   static constexpr int ThCols = _tc;
-  /** Tile dimension k */
-  static constexpr int Kblk = _kblk;
   /** Tile dimension m */
   static constexpr int Mblk = RowsPerTh * ThRows;
   /** Tile dimension n */
   static constexpr int Nblk = ColsPerTh * ThCols;
+  /** Total size of a tile */
+  static constexpr int TileSize = Mblk * Nblk;
   /** Number of threads per block */
   static constexpr int BlockSize = ThRows * ThCols;
+};
+
+/**
+ * Execution policy for a block-local GEMM
+ *
+ * @tparam _veclen Length for vectorized loads (1 or 2 for fp64 + 4 for fp32)
+ * @tparam _kblk   Tile dimension k
+ * @tparam _rpt    Rows worked per thread
+ * @tparam _cpt    Columns worked per thread
+ * @tparam _tr     Number of thread rows
+ * @tparam _tc     Number of thread columns
+ */
+template <int _veclen, int _kblk, int _rpt, int _cpt, int _tr, int _tc>
+struct BlockGemmPolicy : BlockPolicy<_rpt, _cpt, _tr, _tc> {
+  using Base = BlockPolicy<_rpt, _cpt, _tr, _tc>;
+
+  /** Length for vectorized loads */
+  static constexpr int VecLen = _veclen;
+  /** Tile dimension k */
+  static constexpr int Kblk = _kblk;
 
   /** Number of threads required to load a single column of the A tile */
-  static constexpr int AN_LdRows = Mblk / VecLen;
+  static constexpr int AN_LdRows = Base::Mblk / VecLen;
   /** Number of threads required to load a single row of the A' tile */
   static constexpr int AT_LdRows = Kblk / VecLen;
   /** Number of threads required to load a single column of the B tile */
   static constexpr int BN_LdRows = Kblk / VecLen;
   /** Number of threads required to load a single row of the B' tile */
-  static constexpr int BT_LdRows = Nblk / VecLen;
+  static constexpr int BT_LdRows = Base::Nblk / VecLen;
 
   /* Check that the block size is a multiple of LdRows, i.e one load
    * with the whole block corresponds to a number of full columns */
-  static_assert(BlockSize % AN_LdRows == 0);
-  static_assert(BlockSize % AT_LdRows == 0);
-  static_assert(BlockSize % BN_LdRows == 0);
-  static_assert(BlockSize % BT_LdRows == 0);
+  static_assert(Base::BlockSize % AN_LdRows == 0);
+  static_assert(Base::BlockSize % AT_LdRows == 0);
+  static_assert(Base::BlockSize % BN_LdRows == 0);
+  static_assert(Base::BlockSize % BT_LdRows == 0);
 
   /** Number of columns of the A tile in one load with the whole block */
-  static constexpr int AN_LdCols = BlockSize / AN_LdRows;
+  static constexpr int AN_LdCols = Base::BlockSize / AN_LdRows;
   /** Number of rows of the A' tile in one load with the whole block */
-  static constexpr int AT_LdCols = BlockSize / AT_LdRows;
+  static constexpr int AT_LdCols = Base::BlockSize / AT_LdRows;
   /** Number of columns of the B tile in one load with the whole block */
-  static constexpr int BN_LdCols = BlockSize / BN_LdRows;
+  static constexpr int BN_LdCols = Base::BlockSize / BN_LdRows;
   /** Number of rows of the B' tile in one load with the whole block */
-  static constexpr int BT_LdCols = BlockSize / BT_LdRows;
+  static constexpr int BT_LdCols = Base::BlockSize / BT_LdRows;
 
   /* Number of loads per thread necessary to load the A tile */
   static constexpr int AN_LdCount = Kblk / AN_LdCols;
   /* Number of loads per thread necessary to load the A' tile */
-  static constexpr int AT_LdCount = Mblk / AT_LdCols;
+  static constexpr int AT_LdCount = Base::Mblk / AT_LdCols;
   /* Number of loads per thread necessary to load the B tile */
-  static constexpr int BN_LdCount = Nblk / BN_LdCols;
+  static constexpr int BN_LdCount = Base::Nblk / BN_LdCols;
   /* Number of loads per thread necessary to load the B' tile */
   static constexpr int BT_LdCount = Kblk / BT_LdCols;
 };
@@ -173,6 +190,15 @@ struct GemvStorage {
   T acc[GemvPolicy::BlockSize];
 };
 
+/**
+ * Structure to hold the shared memory used by covariance numerical stability operation
+ */
+template <typename CovStabilityPolicy, typename T>
+struct CovStabilityStorage {
+  /** Transposed tile */
+  T tile[CovStabilityPolicy::TileSize];
+};
+
 /**
  * Structure to hold the shared memory used by a block reduction
  */
@@ -495,5 +521,63 @@ DI T _block_xAxt(
   return _block_reduce<BlockSize, Broadcast>(acc, reduction_storage);
 }
 
+/**
+ * @brief Improves numerical accuracy by making sure that the covariance matrix
+ *        is symmetric and only has positive elements along the diagonal.
+ *
+ * @todo: solve bank conflicts
+ *
+ * @tparam     CovPolicy   Execution policy
+ * @tparam     T           Floating-point type
+ * @tparam     StorageT    Shared memory storage structure type
+ * @param[in]  n           Matrix size
+ * @param[in]  in          Input covariance matrix
+ * @param[out] out         Output covariance matrix
+ * @param[in]  cov_storage Temporary shared memory storage
+ */
+template <typename CovPolicy, typename T, typename StorageT>
+DI void _block_covariance_stability(int n, const T* in, T* out, StorageT& cov_storage)
+{
+  int th_off_i = threadIdx.x % CovPolicy::ThRows;
+  int th_off_j = threadIdx.x / CovPolicy::ThRows;
+
+  /* Loop over tiles */
+  for (int blk_j = 0; blk_j < raft::ceildiv<int>(n, CovPolicy::Nblk); blk_j++) {
+    for (int blk_i = 0; blk_i < raft::ceildiv<int>(n, CovPolicy::Mblk); blk_i++) {
+      // Load the tile of the transpose matrix into a N x M shared memory tile
+      _load_tile<false,
+                 CovPolicy::BlockSize,
+                 1,
+                 CovPolicy::Nblk,
+                 CovPolicy::BlockSize / CovPolicy::Nblk,
+                 CovPolicy::RowsPerTh * CovPolicy::ColsPerTh,
+                 CovPolicy::Nblk,
+                 CovPolicy::Mblk>(
+        in, cov_storage.tile, blk_j * CovPolicy::Nblk, blk_i * CovPolicy::Mblk, n, n);
+      __syncthreads();
+
+      // Read from matrix and transposed tile, write to output matrix
+#pragma unroll
+      for (int th_j = 0; th_j < CovPolicy::ColsPerTh; th_j++) {
+#pragma unroll
+        for (int th_i = 0; th_i < CovPolicy::RowsPerTh; th_i++) {
+          int i  = th_off_i + th_i * CovPolicy::ThRows;
+          int j  = th_off_j + th_j * CovPolicy::ThCols;
+          int gi = blk_i * CovPolicy::Mblk + i;
+          int gj = blk_j * CovPolicy::Nblk + j;
+
+          if (gi < n && gj < n) {
+            T in0            = in[gj * n + gi];
+            T in1            = cov_storage.tile[i * CovPolicy::Nblk + j];
+            out[gj * n + gi] = gi == gj ? abs(in0) : 0.5 * (in0 + in1);
+          }
+        }
+      }
+
+      __syncthreads();
+    }
+  }
+}
+
 }  // namespace LinAlg
 }  // namespace MLCommon
diff --git a/cpp/src_prims/linalg/lstsq.cuh b/cpp/src_prims/linalg/lstsq.cuh
index 8dd0793e61..abc143003e 100644
--- a/cpp/src_prims/linalg/lstsq.cuh
+++ b/cpp/src_prims/linalg/lstsq.cuh
@@ -21,8 +21,10 @@
 #include <raft/linalg/cusolver_wrappers.h>
 #include <raft/linalg/gemv.h>
 #include <raft/linalg/transpose.h>
+#include <common/nvtx.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/linalg/eig.cuh>
+#include <raft/linalg/eltwise.cuh>
 #include <raft/linalg/gemm.cuh>
 #include <raft/linalg/qr.cuh>
 #include <raft/linalg/svd.cuh>
@@ -30,65 +32,322 @@
 #include <raft/matrix/matrix.cuh>
 #include <raft/mr/device/buffer.hpp>
 #include <raft/random/rng.cuh>
+#include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
 namespace MLCommon {
 namespace LinAlg {
 
+namespace {
+
+/** Operate a CUDA event if we're in the concurrent mode; no-op otherwise. */
+struct DeviceEvent {
+ private:
+  cudaEvent_t e;
+
+ public:
+  DeviceEvent(bool concurrent)
+  {
+    if (concurrent)
+      CUDA_CHECK(cudaEventCreate(&e));
+    else
+      e = nullptr;
+  }
+  ~DeviceEvent()
+  {
+    if (e != nullptr) CUDA_CHECK_NO_THROW(cudaEventDestroy(e));
+  }
+  operator cudaEvent_t() const { return e; }
+  void record(cudaStream_t stream)
+  {
+    if (e != nullptr) CUDA_CHECK(cudaEventRecord(e, stream));
+  }
+  void wait(cudaStream_t stream)
+  {
+    if (e != nullptr) CUDA_CHECK(cudaStreamWaitEvent(stream, e, 0u));
+  }
+  void wait()
+  {
+    if (e != nullptr) CUDA_CHECK(cudaEventSynchronize(e));
+  }
+  DeviceEvent& operator=(const DeviceEvent& other) = delete;
+};
+
+/**
+ *  @brief Tells if the viewed CUDA stream is implicitly synchronized with the given stream.
+ *
+ *  This can happen e.g.
+ *   if the two views point to the same stream
+ *   or sometimes when one of them is the legacy default stream.
+ */
+bool are_implicitly_synchronized(rmm::cuda_stream_view a, rmm::cuda_stream_view b)
+{
+  // any stream is "synchronized" with itself
+  if (a.value() == b.value()) return true;
+  // legacy + blocking streams
+  unsigned int flags = 0;
+  if (a.is_default()) {
+    CUDA_CHECK(cudaStreamGetFlags(b.value(), &flags));
+    if ((flags & cudaStreamNonBlocking) == 0) return true;
+  }
+  if (b.is_default()) {
+    CUDA_CHECK(cudaStreamGetFlags(a.value(), &flags));
+    if ((flags & cudaStreamNonBlocking) == 0) return true;
+  }
+  return false;
+}
+
 template <typename math_t>
-void lstsq(const raft::handle_t& handle,
-           math_t* A,
-           int n_rows,
-           int n_cols,
-           math_t* b,
-           math_t* w,
-           int algo,
-           cudaStream_t stream)
+struct DivideByNonZero {
+  constexpr static const math_t eps = math_t(1e-10);
+
+  __device__ math_t operator()(const math_t a, const math_t b) const
+  {
+    return raft::myAbs<math_t>(b) >= eps ? a / b : a;
+  }
+};
+
+}  // namespace
+
+/** Solves the linear ordinary least squares problem `Aw = b`
+ *  Via SVD decomposition of `A = U S Vt` using default cuSOLVER routine.
+ *
+ *  @param A - input feature matrix; it's marked [in/out] in the used cuSOLVER routines,
+ *             so it's not guaranteed to stay unmodified.
+ */
+template <typename math_t>
+void lstsqSvdQR(const raft::handle_t& handle,
+                math_t* A,
+                const int n_rows,
+                const int n_cols,
+                const math_t* b,
+                math_t* w,
+                cudaStream_t stream)
 {
+  const int minmn              = min(n_rows, n_cols);
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
-  cublasHandle_t cublasH       = handle.get_cublas_handle();
-
-  ASSERT(n_rows > 1, "lstsq: number of rows cannot be less than two");
+  int cusolverWorkSetSize      = 0;
+  CUSOLVER_CHECK(raft::linalg::cusolverDngesvd_bufferSize<math_t>(
+    cusolverH, n_rows, n_cols, &cusolverWorkSetSize));
 
-  size_t U_len = n_rows * n_cols;
-  size_t V_len = n_cols * n_cols;
+  rmm::device_uvector<math_t> workset(cusolverWorkSetSize  // cuSolver
+                                        + n_rows * minmn   // U
+                                        + n_cols * n_cols  // V
+                                        + minmn            // S
+                                        + minmn            // U^T * b
+                                        + 1                // devInfo
+                                      ,
+                                      stream);
+  math_t* cusolverWorkSet = workset.data();
+  math_t* U               = cusolverWorkSet + cusolverWorkSetSize;
+  math_t* Vt              = U + n_rows * minmn;
+  math_t* S               = Vt + n_cols * n_cols;
+  math_t* Ub              = S + minmn;
+  int* devInfo            = reinterpret_cast<int*>(Ub + minmn);
 
-  rmm::device_uvector<math_t> S(n_cols, stream);
-  rmm::device_uvector<math_t> V(V_len, stream);
-  rmm::device_uvector<math_t> U(U_len, stream);
+  CUSOLVER_CHECK(raft::linalg::cusolverDngesvd<math_t>(cusolverH,
+                                                       'S',
+                                                       'S',
+                                                       n_rows,
+                                                       n_cols,
+                                                       A,
+                                                       n_rows,
+                                                       S,
+                                                       U,
+                                                       n_rows,
+                                                       Vt,
+                                                       n_cols,
+                                                       cusolverWorkSet,
+                                                       cusolverWorkSetSize,
+                                                       nullptr,
+                                                       devInfo,
+                                                       stream));
+  raft::linalg::gemv(handle, U, n_rows, minmn, b, Ub, true, stream);
+  raft::linalg::binaryOp(Ub, Ub, S, minmn, DivideByNonZero<math_t>(), stream);
+  raft::linalg::gemv(handle, Vt, minmn, n_cols, n_cols, Ub, w, true, stream);
+}
 
-  // we use a temporary vector to avoid doing re-using w in the last step, the
-  // gemv, which could cause a very sporadic race condition in Pascal and
-  // Turing GPUs that caused it to give the wrong results. Details:
-  // https://github.com/rapidsai/cuml/issues/1739
-  rmm::device_uvector<math_t> tmp_vector(n_cols, stream);
+/** Solves the linear ordinary least squares problem `Aw = b`
+ *  Via SVD decomposition of `A = U S V^T` using Jacobi iterations (cuSOLVER).
+ *
+ *  @param A - input feature matrix; it's marked [in/out] in the used cuSOLVER routines,
+ *             so it's not guaranteed to stay unmodified.
+ */
+template <typename math_t>
+void lstsqSvdJacobi(const raft::handle_t& handle,
+                    math_t* A,
+                    const int n_rows,
+                    const int n_cols,
+                    const math_t* b,
+                    math_t* w,
+                    cudaStream_t stream)
+{
+  const int minmn = min(n_rows, n_cols);
+  gesvdjInfo_t gesvdj_params;
+  CUSOLVER_CHECK(cusolverDnCreateGesvdjInfo(&gesvdj_params));
+  int cusolverWorkSetSize      = 0;
+  cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
+  CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize<math_t>(cusolverH,
+                                                                   CUSOLVER_EIG_MODE_VECTOR,
+                                                                   1,
+                                                                   n_rows,
+                                                                   n_cols,
+                                                                   A,
+                                                                   n_rows,
+                                                                   nullptr,
+                                                                   nullptr,
+                                                                   n_rows,
+                                                                   nullptr,
+                                                                   n_cols,
+                                                                   &cusolverWorkSetSize,
+                                                                   gesvdj_params));
+  rmm::device_uvector<math_t> workset(cusolverWorkSetSize  // cuSolver
+                                        + n_rows * minmn   // U
+                                        + n_cols * minmn   // V
+                                        + minmn            // S
+                                        + minmn            // U^T * b
+                                        + 1                // devInfo
+                                      ,
+                                      stream);
+  math_t* cusolverWorkSet = workset.data();
+  math_t* U               = cusolverWorkSet + cusolverWorkSetSize;
+  math_t* V               = U + n_rows * minmn;
+  math_t* S               = V + n_cols * minmn;
+  math_t* Ub              = S + minmn;
+  int* devInfo            = reinterpret_cast<int*>(Ub + minmn);
+  CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj<math_t>(cusolverH,
+                                                        CUSOLVER_EIG_MODE_VECTOR,
+                                                        1,
+                                                        n_rows,
+                                                        n_cols,
+                                                        A,
+                                                        n_rows,
+                                                        S,
+                                                        U,
+                                                        n_rows,
+                                                        V,
+                                                        n_cols,
+                                                        cusolverWorkSet,
+                                                        cusolverWorkSetSize,
+                                                        devInfo,
+                                                        gesvdj_params,
+                                                        stream));
+  raft::linalg::gemv(handle, U, n_rows, minmn, b, Ub, true, stream);
+  raft::linalg::binaryOp(Ub, Ub, S, minmn, DivideByNonZero<math_t>(), stream);
+  raft::linalg::gemv(handle, V, n_cols, minmn, Ub, w, false, stream);
+}
 
-  if (algo == 0 || n_cols == 1) {
-    raft::linalg::svdQR(
-      handle, A, n_rows, n_cols, S.data(), U.data(), V.data(), true, true, true, stream);
-  } else if (algo == 1) {
-    raft::linalg::svdEig(handle, A, n_rows, n_cols, S.data(), U.data(), V.data(), true, stream);
+/** Solves the linear ordinary least squares problem `Aw = b`
+ *  via eigenvalue decomposition of `A^T * A` (covariance matrix for dataset A).
+ *  (`w = (A^T A)^-1  A^T b`)
+ */
+template <typename math_t>
+void lstsqEig(const raft::handle_t& handle,
+              const math_t* A,
+              const int n_rows,
+              const int n_cols,
+              const math_t* b,
+              math_t* w,
+              cudaStream_t stream)
+{
+  rmm::cuda_stream_view mainStream   = rmm::cuda_stream_view(stream);
+  rmm::cuda_stream_view multAbStream = mainStream;
+  bool concurrent                    = false;
+  {
+    int sp_size = handle.get_num_internal_streams();
+    if (sp_size > 0) {
+      multAbStream = handle.get_internal_stream_view(0);
+      // check if the two streams can run concurrently
+      if (!are_implicitly_synchronized(mainStream, multAbStream)) {
+        concurrent = true;
+      } else if (sp_size > 1) {
+        mainStream   = multAbStream;
+        multAbStream = handle.get_internal_stream_view(1);
+        concurrent   = true;
+      }
+    }
   }
+  // the event is created only if the given raft handle is capable of running
+  // at least two CUDA streams without implicit synchronization.
+  DeviceEvent multAbDone(concurrent);
+
+  rmm::device_uvector<math_t> workset(n_cols * n_cols * 3 + n_cols * 2, mainStream);
+  math_t* Q    = workset.data();
+  math_t* QS   = Q + n_cols * n_cols;
+  math_t* covA = QS + n_cols * n_cols;
+  math_t* S    = covA + n_cols * n_cols;
+  math_t* Ab   = S + n_cols;
+
+  // covA <- A* A
+  math_t alpha = math_t(1);
+  math_t beta  = math_t(0);
+  raft::linalg::gemm(handle,
+                     A,
+                     n_rows,
+                     n_cols,
+                     A,
+                     covA,
+                     n_cols,
+                     n_cols,
+                     CUBLAS_OP_T,
+                     CUBLAS_OP_N,
+                     alpha,
+                     beta,
+                     mainStream);
 
-  raft::linalg::gemv(handle, U.data(), n_rows, n_cols, b, tmp_vector.data(), true, stream);
+  // Ab <- A* b
+  raft::linalg::gemv(handle, A, n_rows, n_cols, b, Ab, true, multAbStream);
+  multAbDone.record(multAbStream);
 
-  raft::matrix::matrixVectorBinaryDivSkipZero(
-    tmp_vector.data(), S.data(), 1, n_cols, false, true, stream);
+  // Q S Q* <- covA
+  ML::PUSH_RANGE("Trace::MLCommon::LinAlg::lstsq::eigDC", mainStream);
+  raft::linalg::eigDC(handle, covA, n_cols, n_cols, Q, S, mainStream);
+  ML::POP_RANGE(mainStream);
 
-  raft::linalg::gemv(handle, V.data(), n_cols, n_cols, tmp_vector.data(), w, false, stream);
+  // QS  <- Q invS
+  raft::linalg::matrixVectorOp(
+    QS, Q, S, n_cols, n_cols, false, true, DivideByNonZero<math_t>(), mainStream);
+  // covA <- QS Q* == Q invS Q* == inv(A* A)
+  raft::linalg::gemm(handle,
+                     QS,
+                     n_cols,
+                     n_cols,
+                     Q,
+                     covA,
+                     n_cols,
+                     n_cols,
+                     CUBLAS_OP_N,
+                     CUBLAS_OP_T,
+                     alpha,
+                     beta,
+                     mainStream);
+  multAbDone.wait(mainStream);
+  // w <- covA Ab == Q invS Q* A b == inv(A* A) A b
+  raft::linalg::gemv(handle, covA, n_cols, n_cols, Ab, w, false, mainStream);
 }
 
+/** Solves the linear ordinary least squares problem `Aw = b`
+ *  via QR decomposition of `A = QR`.
+ *  (triangular system of equations `Rw = Q^T b`)
+ *
+ * @param A[in/out] - input feature matrix.
+ *            Warning: the content of this matrix is modified by the cuSOLVER routines.
+ * @param b[in/out] - input target vector.
+ *            Warning: the content of this vector is modified by the cuSOLVER routines.
+ */
 template <typename math_t>
-void lstsqQR(math_t* A,
-             int n_rows,
-             int n_cols,
+void lstsqQR(const raft::handle_t& handle,
+             math_t* A,
+             const int n_rows,
+             const int n_cols,
              math_t* b,
              math_t* w,
-             cusolverDnHandle_t cusolverH,
-             cublasHandle_t cublasH,
              cudaStream_t stream)
 {
+  cublasHandle_t cublasH       = handle.get_cublas_handle();
+  cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
+
   int m = n_rows;
   int n = n_cols;
 
@@ -172,6 +431,4 @@ void lstsqQR(math_t* A,
 }
 
 };  // namespace LinAlg
-// end namespace LinAlg
 };  // namespace MLCommon
-// end namespace MLCommon
diff --git a/cpp/src_prims/matrix/grammatrix.cuh b/cpp/src_prims/matrix/grammatrix.cuh
index 8972161385..75a90ee47d 100644
--- a/cpp/src_prims/matrix/grammatrix.cuh
+++ b/cpp/src_prims/matrix/grammatrix.cuh
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <raft/linalg/cublas_wrappers.h>
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/linalg/gemm.cuh>
 
 namespace MLCommon {
diff --git a/cpp/src_prims/matrix/kernelmatrices.cuh b/cpp/src_prims/matrix/kernelmatrices.cuh
index 3f5df1b7f0..6c45e24d96 100644
--- a/cpp/src_prims/matrix/kernelmatrices.cuh
+++ b/cpp/src_prims/matrix/kernelmatrices.cuh
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <raft/cuda_utils.cuh>
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/linalg/gemm.cuh>
 #include "grammatrix.cuh"
 
diff --git a/cpp/src_prims/metrics/scores.cuh b/cpp/src_prims/metrics/scores.cuh
index 14ea268a50..270b940673 100644
--- a/cpp/src_prims/metrics/scores.cuh
+++ b/cpp/src_prims/metrics/scores.cuh
@@ -21,7 +21,7 @@
 #include <thrust/reduce.h>
 #include <linalg/power.cuh>
 #include <memory>
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/linalg/eltwise.cuh>
 #include <raft/linalg/subtract.cuh>
 #include <raft/spatial/knn/knn.hpp>
diff --git a/cpp/src_prims/metrics/silhouette_score.cuh b/cpp/src_prims/metrics/silhouette_score.cuh
index c10c7f2cfc..5c6a1c959c 100644
--- a/cpp/src_prims/metrics/silhouette_score.cuh
+++ b/cpp/src_prims/metrics/silhouette_score.cuh
@@ -25,7 +25,7 @@
 #include <linalg/reduce_cols_by_key.cuh>
 #include <numeric>
 #include <raft/cuda_utils.cuh>
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/linalg/binary_op.cuh>
 #include <raft/linalg/eltwise.cuh>
 #include <raft/linalg/map_then_reduce.cuh>
diff --git a/cpp/src_prims/selection/knn.cuh b/cpp/src_prims/selection/knn.cuh
index fd484daefb..1b03835376 100644
--- a/cpp/src_prims/selection/knn.cuh
+++ b/cpp/src_prims/selection/knn.cuh
@@ -26,7 +26,7 @@
 #include <raft/cudart_utils.h>
 #include <raft/linalg/distance_type.h>
 #include <raft/cuda_utils.cuh>
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/mr/device/allocator.hpp>
 
 #include <faiss/gpu/GpuDistance.h>
diff --git a/cpp/src_prims/timeSeries/fillna.cuh b/cpp/src_prims/timeSeries/fillna.cuh
new file mode 100644
index 0000000000..b67e3617db
--- /dev/null
+++ b/cpp/src_prims/timeSeries/fillna.cuh
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cub/cub.cuh>
+
+#include <cuml/tsa/arima_common.h>
+#include <raft/cudart_utils.h>
+#include <linalg/batched/matrix.cuh>
+#include <raft/cuda_utils.cuh>
+#include <raft/linalg/matrix_vector_op.cuh>
+#include <raft/linalg/unary_op.cuh>
+#include <raft/mr/device/allocator.hpp>
+#include <rmm/device_uvector.hpp>
+#include "jones_transform.cuh"
+
+// Auxiliary functions in anonymous namespace
+namespace {
+
+struct FillnaTemp {
+  /** After the scan, this index refers to the position of the last valid value */
+  int index;
+  /** This indicates whether a value is valid, i.e != NaN */
+  bool is_valid;
+  /** This indicates that this position is the first of a series and values from the previous series
+   * in the batch cannot be used to fill missing observations */
+  bool is_first;
+};
+
+// Functor for making the temp object from an index
+template <bool forward, typename T>
+struct FillnaTempMaker {
+  const T* data;
+  int batch_size;
+  int n_obs;
+
+  __host__ __device__ FillnaTempMaker(const T* data_, int batch_size_, int n_obs_)
+    : data(data_), batch_size(batch_size_), n_obs(n_obs_)
+  {
+  }
+
+  __host__ __device__ __forceinline__ FillnaTemp operator()(const int& index) const
+  {
+    if (forward)
+      return {index, !isnan(data[index]), index % n_obs == 0};
+    else {
+      int index_bwd = batch_size * n_obs - 1 - index;
+      return {index_bwd, !isnan(data[index_bwd]), index % n_obs == 0};
+    }
+  }
+};
+
+struct FillnaOp {
+  __host__ __device__ __forceinline__ FillnaTemp operator()(const FillnaTemp& lhs,
+                                                            const FillnaTemp& rhs) const
+  {
+    return (rhs.is_first || rhs.is_valid) ? rhs : lhs;
+  }
+};
+
+template <bool forward, typename T>
+__global__ void fillna_interpolate_kernel(T* data,
+                                          int n_elem,
+                                          FillnaTemp* d_indices_fwd,
+                                          FillnaTemp* d_indices_bwd)
+{
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < n_elem;
+       index += gridDim.x * blockDim.x) {
+    if (isnan(data[index])) {
+      FillnaTemp fwd = d_indices_fwd[index];
+      FillnaTemp bwd = d_indices_bwd[n_elem - 1 - index];
+      T value_fwd    = data[fwd.index];
+      T value_bwd    = data[bwd.index];
+
+      if (!fwd.is_valid) {
+        data[index] = value_bwd;
+      } else if (!bwd.is_valid) {
+        data[index] = value_fwd;
+      } else {
+        T coef      = (T)(index - fwd.index) / (T)(bwd.index - fwd.index);
+        data[index] = ((T)1 - coef) * value_fwd + coef * value_bwd;
+      }
+    }
+  }
+}
+
+}  // namespace
+
+namespace MLCommon {
+namespace TimeSeries {
+
+/**
+ * Fill NaN values by interpolating between the last and next valid values
+ *
+ * @param[inout] data       Data which will be processed in-place
+ * @param[in]    batch_size Number of series in the batch
+ * @param[in]    n_obs      Number of observations per series
+ * @param[in]    stream     CUDA stream
+ */
+template <typename T>
+void fillna(T* data, int batch_size, int n_obs, cudaStream_t stream)
+{
+  rmm::device_uvector<FillnaTemp> indices_fwd(batch_size * n_obs, stream);
+  rmm::device_uvector<FillnaTemp> indices_bwd(batch_size * n_obs, stream);
+  FillnaTempMaker<true, T> transform_op_fwd(data, batch_size, n_obs);
+  FillnaTempMaker<false, T> transform_op_bwd(data, batch_size, n_obs);
+  cub::CountingInputIterator<int> counting(0);
+  FillnaOp scan_op;
+
+  // Iterators wrapping the data with metadata (valid, first of its series)
+  cub::TransformInputIterator<FillnaTemp, FillnaTempMaker<true, T>, cub::CountingInputIterator<int>>
+    itr_fwd(counting, transform_op_fwd);
+  cub::
+    TransformInputIterator<FillnaTemp, FillnaTempMaker<false, T>, cub::CountingInputIterator<int>>
+      itr_bwd(counting, transform_op_bwd);
+
+  // Allocate temporary storage
+  size_t temp_storage_bytes = 0;
+  cub::DeviceScan::InclusiveScan(
+    nullptr, temp_storage_bytes, itr_fwd, indices_fwd.data(), scan_op, batch_size * n_obs, stream);
+  rmm::device_uvector<char> temp_storage(temp_storage_bytes, stream);
+  void* d_temp_storage = (void*)temp_storage.data();
+
+  // Execute scan (forward)
+  cub::DeviceScan::InclusiveScan(d_temp_storage,
+                                 temp_storage_bytes,
+                                 itr_fwd,
+                                 indices_fwd.data(),
+                                 scan_op,
+                                 batch_size * n_obs,
+                                 stream);
+
+  // Execute scan (backward)
+  cub::DeviceScan::InclusiveScan(d_temp_storage,
+                                 temp_storage_bytes,
+                                 itr_bwd,
+                                 indices_bwd.data(),
+                                 scan_op,
+                                 batch_size * n_obs,
+                                 stream);
+
+  const int TPB      = 256;
+  const int n_blocks = raft::ceildiv<int>(n_obs * batch_size, TPB);
+
+  // Interpolate valid values
+  fillna_interpolate_kernel<false><<<n_blocks, TPB, 0, stream>>>(
+    data, batch_size * n_obs, indices_fwd.data(), indices_bwd.data());
+  CUDA_CHECK(cudaGetLastError());
+}
+
+}  // namespace TimeSeries
+}  // namespace MLCommon
\ No newline at end of file
diff --git a/cpp/src_prims/timeSeries/jones_transform.cuh b/cpp/src_prims/timeSeries/jones_transform.cuh
index d23549a99d..67afef52de 100644
--- a/cpp/src_prims/timeSeries/jones_transform.cuh
+++ b/cpp/src_prims/timeSeries/jones_transform.cuh
@@ -51,9 +51,10 @@ struct PAC {
 * @param tmp: the temporary array used in transformation
 * @param myNewParams: will contain the transformed params
 * @param isAr: tell the type of transform (if ar or ma transform)
+* @param clamp: whether to clamp transformed params between -1 and 1
 */
 template <typename DataT, typename IdxT, int VALUE>
-inline __device__ void transform(DataT* tmp, DataT* myNewParams, bool isAr)
+inline __device__ void transform(DataT* tmp, DataT* myNewParams, bool isAr, bool clamp)
 {
   // do the ar transformation
   PAC<DataT> pac;
@@ -86,6 +87,13 @@ inline __device__ void transform(DataT* tmp, DataT* myNewParams, bool isAr)
       }
     }
   }
+
+  if (clamp) {
+    // Clamp values to avoid numerical issues when very close to 1
+    for (int i = 0; i < VALUE; ++i) {
+      myNewParams[i] = max(-0.9999, min(myNewParams[i], 0.9999));
+    }
+  }
 }
 
 /**
@@ -146,10 +154,11 @@ inline __device__ void invtransform(DataT* tmp, DataT* myNewParams, bool isAr)
  * @param batchSize: number of models in a batch
  * @param isAr: if the coefficients to be transformed are Autoregressive or moving average
  * @param isInv: if the transformation type is regular or inverse
+ * @param clamp: whether to clamp transformed params between -1 and 1
  */
 template <typename DataT, int VALUE, typename IdxT, int BLOCK_DIM_X, int BLOCK_DIM_Y>
 __global__ void jones_transform_kernel(
-  DataT* newParams, const DataT* params, IdxT batchSize, bool isAr, bool isInv)
+  DataT* newParams, const DataT* params, IdxT batchSize, bool isAr, bool isInv, bool clamp)
 {
   // calculating the index of the model that the coefficients belong to
   IdxT modelIndex = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x);
@@ -169,7 +178,7 @@ __global__ void jones_transform_kernel(
     if (isInv)
       invtransform<DataT, IdxT, VALUE>(tmp, myNewParams, isAr);
     else
-      transform<DataT, IdxT, VALUE>(tmp, myNewParams, isAr);
+      transform<DataT, IdxT, VALUE>(tmp, myNewParams, isAr, clamp);
 
 // store
 #pragma unroll
@@ -193,6 +202,7 @@ __global__ void jones_transform_kernel(
  * @param isInv: set to true if the transformation is an inverse type transformation, false if
  * regular transform
  * @param stream: the cudaStream object
+ * @param clamp: whether to clamp transformed params between -1 and 1
  */
 template <typename DataT, typename IdxT = int>
 void jones_transform(const DataT* params,
@@ -201,7 +211,8 @@ void jones_transform(const DataT* params,
                      DataT* newParams,
                      bool isAr,
                      bool isInv,
-                     cudaStream_t stream)
+                     cudaStream_t stream,
+                     bool clamp = true)
 {
   ASSERT(batchSize >= 1 && parameter >= 1, "not defined!");
 
@@ -220,35 +231,43 @@ void jones_transform(const DataT* params,
   switch (parameter) {
     case 1:
       jones_transform_kernel<DataT, 1, IdxT, BLOCK_DIM_X, BLOCK_DIM_Y>
-        <<<numBlocks, numThreadsPerBlock, 0, stream>>>(newParams, params, batchSize, isAr, isInv);
+        <<<numBlocks, numThreadsPerBlock, 0, stream>>>(
+          newParams, params, batchSize, isAr, isInv, clamp);
       break;
     case 2:
       jones_transform_kernel<DataT, 2, IdxT, BLOCK_DIM_X, BLOCK_DIM_Y>
-        <<<numBlocks, numThreadsPerBlock, 0, stream>>>(newParams, params, batchSize, isAr, isInv);
+        <<<numBlocks, numThreadsPerBlock, 0, stream>>>(
+          newParams, params, batchSize, isAr, isInv, clamp);
       break;
     case 3:
       jones_transform_kernel<DataT, 3, IdxT, BLOCK_DIM_X, BLOCK_DIM_Y>
-        <<<numBlocks, numThreadsPerBlock, 0, stream>>>(newParams, params, batchSize, isAr, isInv);
+        <<<numBlocks, numThreadsPerBlock, 0, stream>>>(
+          newParams, params, batchSize, isAr, isInv, clamp);
       break;
     case 4:
       jones_transform_kernel<DataT, 4, IdxT, BLOCK_DIM_X, BLOCK_DIM_Y>
-        <<<numBlocks, numThreadsPerBlock, 0, stream>>>(newParams, params, batchSize, isAr, isInv);
+        <<<numBlocks, numThreadsPerBlock, 0, stream>>>(
+          newParams, params, batchSize, isAr, isInv, clamp);
       break;
     case 5:
       jones_transform_kernel<DataT, 5, IdxT, BLOCK_DIM_X, BLOCK_DIM_Y>
-        <<<numBlocks, numThreadsPerBlock, 0, stream>>>(newParams, params, batchSize, isAr, isInv);
+        <<<numBlocks, numThreadsPerBlock, 0, stream>>>(
+          newParams, params, batchSize, isAr, isInv, clamp);
       break;
     case 6:
       jones_transform_kernel<DataT, 6, IdxT, BLOCK_DIM_X, BLOCK_DIM_Y>
-        <<<numBlocks, numThreadsPerBlock, 0, stream>>>(newParams, params, batchSize, isAr, isInv);
+        <<<numBlocks, numThreadsPerBlock, 0, stream>>>(
+          newParams, params, batchSize, isAr, isInv, clamp);
       break;
     case 7:
       jones_transform_kernel<DataT, 7, IdxT, BLOCK_DIM_X, BLOCK_DIM_Y>
-        <<<numBlocks, numThreadsPerBlock, 0, stream>>>(newParams, params, batchSize, isAr, isInv);
+        <<<numBlocks, numThreadsPerBlock, 0, stream>>>(
+          newParams, params, batchSize, isAr, isInv, clamp);
       break;
     case 8:
       jones_transform_kernel<DataT, 8, IdxT, BLOCK_DIM_X, BLOCK_DIM_Y>
-        <<<numBlocks, numThreadsPerBlock, 0, stream>>>(newParams, params, batchSize, isAr, isInv);
+        <<<numBlocks, numThreadsPerBlock, 0, stream>>>(
+          newParams, params, batchSize, isAr, isInv, clamp);
       break;
     default: ASSERT(false, "Unsupported parameter '%d'!", parameter);
   }
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index a60e6080e6..6823d6d6e7 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -23,7 +23,7 @@ set(COMMON_TEST_LINK_LIBRARIES
   CUDA::cufft
   rmm::rmm
   raft::raft
-  FAISS::FAISS
+  $<$<BOOL:${LINK_FAISS}>:FAISS::FAISS>
   GTest::gtest
   GTest::gtest_main
   OpenMP::OpenMP_CXX
@@ -38,39 +38,46 @@ set(COMMON_TEST_LINK_LIBRARIES
 
 if(BUILD_CUML_TESTS)
   # (please keep the filenames in alphabetical order)
+
+  # Separable executable for each test targetted for 21.12, meanwhile
+  # we can just separate FIL here, full algorithm support in progress
   add_executable(${CUML_CPP_TEST_TARGET}
-    sg/cd_test.cu
-    sg/dbscan_test.cu
-    sg/fil_test.cu
-    sg/fnv_hash_test.cpp
-    sg/genetic/node_test.cpp
-    sg/genetic/param_test.cu
-    sg/handle_test.cu
-    sg/hdbscan_test.cu
-    sg/holtwinters_test.cu
-    sg/kmeans_test.cu
-    sg/knn_test.cu
-    sg/lars_test.cu
-    sg/linkage_test.cu
-    sg/logger.cpp
-    sg/multi_sum_test.cu
-    sg/nvtx_test.cpp
-    sg/ols.cu
-    sg/pca_test.cu
-    sg/quasi_newton.cu
-    sg/rf_test.cu
-    sg/rf_treelite_test.cu
-    sg/ridge.cu
-    sg/rproj_test.cu
-    sg/sgd.cu
-    sg/shap_kernel.cu
-    sg/svc_test.cu
-    sg/trustworthiness_test.cu
-    sg/tsne_test.cu
-    sg/tsvd_test.cu
-    sg/umap_parametrizable_test.cu
-    $<$<BOOL:BUILD_CUML_C_LIBRARY>:sg/handle_test.cu>
-  )
+    sg/fil_child_index_test.cu
+    sg/fil_test.cu)
+
+  if(CUML_CPP_ALGORITHMS STREQUAL "ALL")
+    target_sources(${CUML_CPP_TEST_TARGET}
+      PRIVATE
+        sg/cd_test.cu
+        sg/dbscan_test.cu
+        sg/fnv_hash_test.cpp
+        sg/genetic/node_test.cpp
+        sg/genetic/param_test.cu
+        sg/hdbscan_test.cu
+        sg/holtwinters_test.cu
+        sg/kmeans_test.cu
+        sg/knn_test.cu
+        sg/lars_test.cu
+        sg/linkage_test.cu
+        sg/logger.cpp
+        sg/multi_sum_test.cu
+        sg/nvtx_test.cpp
+        sg/ols.cu
+        sg/pca_test.cu
+        sg/quasi_newton.cu
+        sg/rf_test.cu
+        sg/ridge.cu
+        sg/rproj_test.cu
+        sg/sgd.cu
+        sg/shap_kernel.cu
+        sg/svc_test.cu
+        sg/trustworthiness_test.cu
+        sg/tsne_test.cu
+        sg/tsvd_test.cu
+        sg/umap_parametrizable_test.cu
+        $<$<BOOL:${BUILD_CUML_C_LIBRARY}>:sg/handle_test.cu>
+      )
+  endif()
 
   target_compile_options(${CUML_CPP_TEST_TARGET}
         PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUML_CXX_FLAGS}>"
@@ -87,8 +94,8 @@ if(BUILD_CUML_TESTS)
 
   target_link_libraries(${CUML_CPP_TEST_TARGET}
     PRIVATE
-      cuml
-        $<$<BOOL:BUILD_CUML_C_LIBRARY>:${CUML_C_TARGET}>
+        ${CUML_CPP_TARGET}
+        $<$<BOOL:${BUILD_CUML_C_LIBRARY}>:${CUML_C_TARGET}>
         ${COMMON_TEST_LINK_LIBRARIES}
     )
 
@@ -122,7 +129,7 @@ if(BUILD_CUML_MG_TESTS)
       )
 
     target_link_libraries(${CUML_MG_TEST_TARGET}
-      cuml
+      ${CUML_CPP_TARGET}
       ${COMMON_TEST_LINK_LIBRARIES}
       NCCL::NCCL
       ${MPI_CXX_LIBRARIES}
@@ -159,6 +166,7 @@ if(BUILD_PRIMS_TESTS)
     prims/entropy.cu
     prims/epsilon_neighborhood.cu
     prims/fast_int_div.cu
+    prims/fillna.cu
     prims/gather.cu
     prims/gram.cu
     prims/grid_sync.cu
@@ -213,7 +221,7 @@ if(BUILD_PRIMS_TESTS)
 
   target_link_libraries(${PRIMS_TEST_TARGET}
     PRIVATE
-      cuml
+      ${CUML_CPP_TARGET}
       ${COMMON_TEST_LINK_LIBRARIES}
   )
 
diff --git a/cpp/test/prims/fillna.cu b/cpp/test/prims/fillna.cu
new file mode 100644
index 0000000000..483d7d8b74
--- /dev/null
+++ b/cpp/test/prims/fillna.cu
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <random>
+#include <vector>
+
+#include <raft/cudart_utils.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/handle.hpp>
+#include <raft/mr/device/allocator.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include "test_utils.h"
+
+#include <timeSeries/fillna.cuh>
+
+namespace MLCommon {
+namespace TimeSeries {
+
+using namespace std;
+
+struct SeriesDescriptor {
+  int leading_nan;
+  int random_nan;
+  int trailing_nan;
+};
+
+template <typename T>
+struct FillnaInputs {
+  int batch_size;
+  int n_obs;
+  std::vector<SeriesDescriptor> descriptors;
+  unsigned long long int seed;
+  T tolerance;
+};
+
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const FillnaInputs<T>& dims)
+{
+  return os;
+}
+
+template <typename T>
+class FillnaTest : public ::testing::TestWithParam<FillnaInputs<T>> {
+ protected:
+  void basicTest()
+  {
+    raft::handle_t handle;
+
+    params = ::testing::TestWithParam<FillnaInputs<T>>::GetParam();
+
+    rmm::device_uvector<T> y(params.n_obs * params.batch_size, handle.get_stream());
+
+    std::vector<T> h_y(params.n_obs * params.batch_size);
+
+    /* Generate random data */
+    std::default_random_engine generator(params.seed);
+    std::uniform_real_distribution<T> real_distribution(-2.0, 2.0);
+    std::uniform_int_distribution<int> int_distribution(0, params.n_obs - 1);
+    for (int i = 0; i < params.n_obs * params.batch_size; i++)
+      h_y[i] = real_distribution(generator);
+    for (int bid = 0; bid < params.batch_size; bid++) {
+      for (int i = 0; i < params.descriptors[bid].leading_nan; i++)
+        h_y[bid * params.n_obs + i] = nan("");
+      for (int i = 0; i < params.descriptors[bid].trailing_nan; i++)
+        h_y[(bid + 1) * params.n_obs - 1 - i] = nan("");
+      for (int i = 0; i < params.descriptors[bid].random_nan; i++) {
+        h_y[bid * params.n_obs + int_distribution(generator)] = nan("");
+      }
+    }
+
+    /* Copy to device */
+    raft::update_device(
+      y.data(), h_y.data(), params.n_obs * params.batch_size, handle.get_stream());
+    CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
+
+    /* Compute using tested prims */
+    fillna(y.data(), params.batch_size, params.n_obs, handle.get_stream());
+
+    /* Compute reference results.
+     * Note: this is done with a sliding window: we find ranges of missing
+     * values bordered by valid values at indices `start` and `end`.
+     * Special cases on extremities are also handled with the special values
+     * -1 for `start` and `n_obs` for `end`.
+     */
+    for (int bid = 0; bid < params.batch_size; bid++) {
+      int start = -1;
+      int end   = 0;
+      while (start < params.n_obs - 1) {
+        if (!std::isnan(h_y[bid * params.n_obs + start + 1])) {
+          start++;
+          end = start + 1;
+        } else if (end < params.n_obs && std::isnan(h_y[bid * params.n_obs + end])) {
+          end++;
+        } else {
+          if (start == -1) {
+            T value = h_y[bid * params.n_obs + end];
+            for (int j = 0; j < end; j++) {
+              h_y[bid * params.n_obs + j] = value;
+            }
+          } else if (end == params.n_obs) {
+            T value = h_y[bid * params.n_obs + start];
+            for (int j = start + 1; j < params.n_obs; j++) {
+              h_y[bid * params.n_obs + j] = value;
+            }
+          } else {
+            T value0 = h_y[bid * params.n_obs + start];
+            T value1 = h_y[bid * params.n_obs + end];
+            for (int j = start + 1; j < end; j++) {
+              T coef                      = (T)(j - start) / (T)(end - start);
+              h_y[bid * params.n_obs + j] = ((T)1 - coef) * value0 + coef * value1;
+            }
+          }
+          start = end;
+          end++;
+        }
+      }
+    }
+
+    /* Check results */
+    match = devArrMatchHost(h_y.data(),
+                            y.data(),
+                            params.n_obs * params.batch_size,
+                            raft::CompareApprox<T>(params.tolerance),
+                            handle.get_stream());
+  }
+
+  void SetUp() override { basicTest(); }
+
+  void TearDown() override {}
+
+ protected:
+  FillnaInputs<T> params;
+
+  testing::AssertionResult match = testing::AssertionFailure();
+};
+
+const std::vector<FillnaInputs<float>> inputsf = {
+  {1, 20, {{1, 5, 1}}, 12345U, 1e-6},
+  {3, 42, {{10, 0, 0}, {0, 10, 0}, {0, 0, 10}}, 12345U, 1e-6},
+  {4, 100, {{70, 0, 0}, {0, 20, 0}, {0, 0, 63}, {31, 25, 33}, {20, 15, 42}}, 12345U, 1e-6},
+};
+
+const std::vector<FillnaInputs<double>> inputsd = {
+  {1, 20, {{1, 5, 1}}, 12345U, 1e-6},
+  {3, 42, {{10, 0, 0}, {0, 10, 0}, {0, 0, 10}}, 12345U, 1e-6},
+  {4, 100, {{70, 0, 0}, {0, 20, 0}, {0, 0, 63}, {31, 25, 33}, {20, 15, 42}}, 12345U, 1e-6},
+};
+
+typedef FillnaTest<float> FillnaTestF;
+TEST_P(FillnaTestF, Result) { EXPECT_TRUE(match); }
+
+typedef FillnaTest<double> FillnaTestD;
+TEST_P(FillnaTestD, Result) { EXPECT_TRUE(match); }
+
+INSTANTIATE_TEST_CASE_P(FillnaTests, FillnaTestF, ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_CASE_P(FillnaTests, FillnaTestD, ::testing::ValuesIn(inputsd));
+
+}  // namespace TimeSeries
+}  // namespace MLCommon
\ No newline at end of file
diff --git a/cpp/test/prims/jones_transform.cu b/cpp/test/prims/jones_transform.cu
index bb3c670c05..4455c5b720 100644
--- a/cpp/test/prims/jones_transform.cu
+++ b/cpp/test/prims/jones_transform.cu
@@ -101,7 +101,7 @@ template
 
     // calling the ar_trans_param CUDA implementation
     MLCommon::TimeSeries::jones_transform(
-      d_params, params.batchSize, params.pValue, d_computed_ar_trans, true, false, stream);
+      d_params, params.batchSize, params.pValue, d_computed_ar_trans, true, false, stream, false);
 
     //>>>>>>>>> MA transform golden output generation<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
@@ -146,7 +146,7 @@ template
 
     // calling the ma_param_transform CUDA implementation
     MLCommon::TimeSeries::jones_transform(
-      d_params, params.batchSize, params.pValue, d_computed_ma_trans, false, false, stream);
+      d_params, params.batchSize, params.pValue, d_computed_ma_trans, false, false, stream, false);
 
     //>>>>>>>>> AR inverse transform <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
@@ -224,12 +224,12 @@ const std::vector<JonesTransParam> inputs = {{500, 4, 0.001},
 typedef JonesTransTest<double> JonesTransTestClass;
 TEST_P(JonesTransTestClass, Result)
 {
-  ASSERT_TRUE(raft::devArrMatch(d_computed_ar_trans,
-                                d_golden_ar_trans,
+  ASSERT_TRUE(raft::devArrMatch(d_golden_ar_trans,
+                                d_computed_ar_trans,
                                 nElements,
                                 raft::CompareApprox<double>(params.tolerance)));
-  ASSERT_TRUE(raft::devArrMatch(d_computed_ma_trans,
-                                d_golden_ma_trans,
+  ASSERT_TRUE(raft::devArrMatch(d_golden_ma_trans,
+                                d_computed_ma_trans,
                                 nElements,
                                 raft::CompareApprox<double>(params.tolerance)));
   /*
diff --git a/cpp/test/prims/linalg_block.cu b/cpp/test/prims/linalg_block.cu
index 6e2c31f358..1ae71159fe 100644
--- a/cpp/test/prims/linalg_block.cu
+++ b/cpp/test/prims/linalg_block.cu
@@ -749,5 +749,133 @@ INSTANTIATE_TEST_CASE_P(BlockAxTests, BlockAxTestF, ::testing::ValuesIn(ax_input
 
 INSTANTIATE_TEST_CASE_P(BlockAxTests, BlockAxTestD, ::testing::ValuesIn(ax_inputsd));
 
+/* Covariance stability */
+
+template <typename T>
+struct BlockCovStabilityInputs {
+  int n;
+  int batch_size;
+  T eps;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const BlockCovStabilityInputs<T>& dims)
+{
+  return os;
+}
+
+template <typename CovPolicy, typename T>
+__global__ void block_cov_stability_test_kernel(int n, const T* in, T* out)
+{
+  __shared__ CovStabilityStorage<CovPolicy, T> cov_stability_storage;
+  _block_covariance_stability<CovPolicy>(
+    n, in + n * n * blockIdx.x, out + n * n * blockIdx.x, cov_stability_storage);
+}
+
+template <typename CovPolicy, typename T>
+class BlockCovStabilityTest : public ::testing::TestWithParam<BlockCovStabilityInputs<T>> {
+ protected:
+  void basicTest()
+  {
+    raft::handle_t handle;
+
+    params = ::testing::TestWithParam<BlockCovStabilityInputs<T>>::GetParam();
+
+    rmm::device_uvector<T> d_in(params.n * params.n * params.batch_size, handle.get_stream());
+    rmm::device_uvector<T> d_out(params.n * params.n * params.batch_size, handle.get_stream());
+
+    std::vector<T> h_in(params.n * params.n * params.batch_size);
+    std::vector<T> h_out(params.n * params.n * params.batch_size);
+
+    /* Generate random data on device */
+    raft::random::Rng r(params.seed);
+    r.uniform(
+      d_in.data(), params.n * params.n * params.batch_size, (T)-2, (T)2, handle.get_stream());
+
+    /* Copy to host */
+    raft::update_host(
+      h_in.data(), d_in.data(), params.n * params.n * params.batch_size, handle.get_stream());
+    CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
+
+    /* Compute using tested prims */
+    block_cov_stability_test_kernel<CovPolicy>
+      <<<params.batch_size, CovPolicy::BlockSize, 0, handle.get_stream()>>>(
+        params.n, d_in.data(), d_out.data());
+
+    /* Compute reference results */
+    for (int bid = 0; bid < params.batch_size; bid++) {
+      for (int i = 0; i < params.n - 1; i++) {
+        for (int j = i + 1; j < params.n; j++) {
+          T val = 0.5 * (h_in[bid * params.n * params.n + j * params.n + i] +
+                         h_in[bid * params.n * params.n + i * params.n + j]);
+          h_out[bid * params.n * params.n + j * params.n + i] = val;
+          h_out[bid * params.n * params.n + i * params.n + j] = val;
+        }
+      }
+      for (int i = 0; i < params.n; i++) {
+        h_out[bid * params.n * params.n + i * params.n + i] =
+          abs(h_in[bid * params.n * params.n + i * params.n + i]);
+      }
+    }
+
+    /* Check results */
+    match = devArrMatchHost(h_out.data(),
+                            d_out.data(),
+                            params.n * params.n * params.batch_size,
+                            raft::CompareApprox<T>(params.eps),
+                            handle.get_stream());
+  }
+
+  void SetUp() override { basicTest(); }
+
+  void TearDown() override {}
+
+ protected:
+  BlockCovStabilityInputs<T> params;
+
+  testing::AssertionResult match = testing::AssertionFailure();
+};
+
+const std::vector<BlockCovStabilityInputs<float>> cs_inputsf = {
+  {15, 4, 1e-4, 12345U},
+  {33, 10, 1e-4, 12345U},
+  {220, 130, 1e-4, 12345U},
+};
+
+const std::vector<BlockCovStabilityInputs<double>> cs_inputsd = {
+  {15, 4, 1e-4, 12345U},
+  {33, 10, 1e-4, 12345U},
+  {220, 130, 1e-4, 12345U},
+};
+
+typedef BlockCovStabilityTest<BlockPolicy<1, 1, 8, 4>, float> BlockCovStabilityTestF_1_1_8_4;
+TEST_P(BlockCovStabilityTestF_1_1_8_4, Result) { EXPECT_TRUE(match); }
+
+typedef BlockCovStabilityTest<BlockPolicy<1, 1, 8, 4>, double> BlockCovStabilityTestD_1_1_8_4;
+TEST_P(BlockCovStabilityTestD_1_1_8_4, Result) { EXPECT_TRUE(match); }
+
+typedef BlockCovStabilityTest<BlockPolicy<1, 4, 32, 8>, float> BlockCovStabilityTestF_1_4_32_8;
+TEST_P(BlockCovStabilityTestF_1_4_32_8, Result) { EXPECT_TRUE(match); }
+
+typedef BlockCovStabilityTest<BlockPolicy<1, 4, 32, 8>, double> BlockCovStabilityTestD_1_4_32_8;
+TEST_P(BlockCovStabilityTestD_1_4_32_8, Result) { EXPECT_TRUE(match); }
+
+INSTANTIATE_TEST_CASE_P(BlockCovStabilityTests,
+                        BlockCovStabilityTestF_1_1_8_4,
+                        ::testing::ValuesIn(cs_inputsf));
+
+INSTANTIATE_TEST_CASE_P(BlockCovStabilityTests,
+                        BlockCovStabilityTestD_1_1_8_4,
+                        ::testing::ValuesIn(cs_inputsd));
+
+INSTANTIATE_TEST_CASE_P(BlockCovStabilityTests,
+                        BlockCovStabilityTestF_1_4_32_8,
+                        ::testing::ValuesIn(cs_inputsf));
+
+INSTANTIATE_TEST_CASE_P(BlockCovStabilityTests,
+                        BlockCovStabilityTestD_1_4_32_8,
+                        ::testing::ValuesIn(cs_inputsd));
+
 }  // namespace LinAlg
 }  // namespace MLCommon
\ No newline at end of file
diff --git a/cpp/test/prims/trustworthiness.cu b/cpp/test/prims/trustworthiness.cu
index 285f8f5301..432c9639f3 100644
--- a/cpp/test/prims/trustworthiness.cu
+++ b/cpp/test/prims/trustworthiness.cu
@@ -18,7 +18,7 @@
 #include <raft/cudart_utils.h>
 #include <iostream>
 #include <metrics/trustworthiness_score.cuh>
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <vector>
 #include "test_utils.h"
 
diff --git a/cpp/test/sg/dbscan_test.cu b/cpp/test/sg/dbscan_test.cu
index 9b16ff76c5..4f88408612 100644
--- a/cpp/test/sg/dbscan_test.cu
+++ b/cpp/test/sg/dbscan_test.cu
@@ -23,7 +23,7 @@
 #include <cuml/cluster/dbscan.hpp>
 #include <cuml/datasets/make_blobs.hpp>
 #include <cuml/metrics/metrics.hpp>
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 
 #include <raft/linalg/cublas_wrappers.h>
 #include <raft/linalg/transpose.h>
diff --git a/cpp/test/sg/decisiontree_batchedlevel_unittest.cu b/cpp/test/sg/decisiontree_batchedlevel_unittest.cu
index 9c402bec2c..37b9519b8c 100644
--- a/cpp/test/sg/decisiontree_batchedlevel_unittest.cu
+++ b/cpp/test/sg/decisiontree_batchedlevel_unittest.cu
@@ -279,7 +279,7 @@ TEST_P(TestMetric, RegressionMetricGain)
 
   CRITERION split_criterion = GetParam();
 
-  ObjectiveT obj(1, params.min_impurity_decrease, params.min_samples_leaf);
+  ObjectiveT obj(1, params.min_samples_leaf);
   size_t smemSize1 = n_bins * sizeof(ObjectiveT::BinT) +  // shist size
                      n_bins * sizeof(DataT) +             // sbins size
                      sizeof(int);                         // sDone size
diff --git a/cpp/test/sg/fil_child_index_test.cu b/cpp/test/sg/fil_child_index_test.cu
new file mode 100644
index 0000000000..fa963eff1f
--- /dev/null
+++ b/cpp/test/sg/fil_child_index_test.cu
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../src/fil/internal.cuh"
+
+#include <test_utils.h>
+
+#include <cuml/fil/fil.h>
+#include <gtest/gtest.h>
+
+#include <cmath>
+#include <cstdio>
+#include <limits>
+#include <memory>
+#include <numeric>
+#include <ostream>
+#include <utility>
+
+namespace ML {
+
+using namespace fil;
+
+struct proto_inner_node {
+  bool def_left       = false;  // default left, see base_node::def_left
+  bool is_categorical = false;  // see base_node::is_categorical
+  int fid             = 0;      // feature id, see base_node::fid
+  int set             = 0;      // which bit set represents the matching category list
+  float thresh        = 0.0f;   // threshold, see base_node::thresh
+  int left            = 1;      // left child idx, see sparse_node*::left_index()
+  val_t split()
+  {
+    val_t split;
+    if (is_categorical)
+      split.idx = set;
+    else
+      split.f = thresh;
+    return split;
+  }
+  operator sparse_node16()
+  {
+    return sparse_node16({}, split(), fid, def_left, false, is_categorical, left);
+  }
+  operator sparse_node8()
+  {
+    return sparse_node8({}, split(), fid, def_left, false, is_categorical, left);
+  }
+  operator dense_node() { return dense_node({}, split(), fid, def_left, false, is_categorical); }
+};
+
+std::ostream& operator<<(std::ostream& os, const proto_inner_node& node)
+{
+  os << "def_left " << node.def_left << " is_categorical " << node.is_categorical << " fid "
+     << node.fid << " set " << node.set << " thresh " << node.thresh << " left " << node.left;
+  return os;
+}
+
+/** mechanism to use named aggregate initialization before C++20, and also use
+    the struct defaults. Using it directly only works if all defaulted
+    members come after ones explicitly mentioned. C++ doesn't have reflection,
+    so any non-macro alternative would need a separate list of member accessors.
+**/
+// proto inner node
+#define NODE(...)                                               \
+  []() {                                                        \
+    struct NonDefaultProtoInnerNode : public proto_inner_node { \
+      NonDefaultProtoInnerNode() { __VA_ARGS__; }               \
+    };                                                          \
+    return proto_inner_node(NonDefaultProtoInnerNode());        \
+  }()
+
+// proto category sets for one node
+struct ProtoCategorySets {
+  // each bit set for each feature id is in a separate vector
+  // read each uint8_t from right to left, and the vector(s) - from left to right
+  std::vector<std::vector<uint8_t>> bits;
+  std::vector<int> max_matching;
+  operator cat_sets_owner()
+  {
+    ASSERT(
+      bits.size() == max_matching.size(),
+      "internal error: ProtoCategorySets::bits.size() != ProtoCategorySets::max_matching.size()");
+    std::vector<uint8_t> flat;
+    for (std::vector<uint8_t> v : bits) {
+      for (uint8_t b : v)
+        flat.push_back(b);
+    }
+    return {flat, max_matching};
+  }
+};
+
+struct ChildIndexTestParams {
+  proto_inner_node node;
+  int parent_node_idx = 0;
+  cat_sets_owner cso;
+  float input = 0.0f;
+  int correct = INT_MAX;
+};
+
+std::ostream& operator<<(std::ostream& os, const ChildIndexTestParams& ps)
+{
+  os << "node = {\n"
+     << ps.node << "\n} "
+     << "parent_node_idx = " << ps.parent_node_idx << " cat_sets_owner = {\n"
+     << ps.cso << "\n} input = " << ps.input << " correct = " << ps.correct;
+  return os;
+}
+
+/** mechanism to use named aggregate initialization before C++20, and also use
+    the struct defaults. Using it directly only works if all defaulted
+    members come after ones explicitly mentioned. C++ doesn't have reflection,
+    so any non-macro alternative would need a separate list of member accessors.
+**/
+#define CHILD_INDEX_TEST_PARAMS(...)                                      \
+  []() {                                                                  \
+    struct NonDefaultChildIndexTestParams : public ChildIndexTestParams { \
+      NonDefaultChildIndexTestParams() { __VA_ARGS__; }                   \
+    };                                                                    \
+    return ChildIndexTestParams(NonDefaultChildIndexTestParams());        \
+  }()
+
+template <typename fil_node_t>
+class ChildIndexTest : public testing::TestWithParam<ChildIndexTestParams> {
+ protected:
+  void check()
+  {
+    ChildIndexTestParams param = GetParam();
+    tree_base tree{param.cso.accessor()};
+    if (!std::is_same<fil_node_t, fil::dense_node>::value) {
+      // test that the logic uses node.left instead of parent_node_idx
+      param.node.left       = param.parent_node_idx * 2 + 1;
+      param.parent_node_idx = INT_MIN;
+    }
+    // nan -> !def_left, categorical -> if matches, numerical -> input >= threshold
+    int test_idx =
+      tree.child_index<true>((fil_node_t)param.node, param.parent_node_idx, param.input);
+    ASSERT(test_idx == param.correct,
+           "child index test: actual %d != correct %d",
+           test_idx,
+           param.correct);
+  }
+};
+
+typedef ChildIndexTest<fil::dense_node> ChildIndexTestDense;
+typedef ChildIndexTest<fil::sparse_node16> ChildIndexTestSparse16;
+typedef ChildIndexTest<fil::sparse_node8> ChildIndexTestSparse8;
+
+/* for dense nodes, left (false) == parent * 2 + 1, right (true) == parent * 2 + 2
+   E.g. see tree below:
+ 0 -> 1, 2
+ 1 -> 3, 4
+ 2 -> 5, 6
+ 3 -> 7, 8
+ 4 -> 9, 10
+ */
+const float INF = std::numeric_limits<float>::infinity();
+
+std::vector<ChildIndexTestParams> params = {
+  CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0f), input = -INF, correct = 1),  // val !>= thresh
+  CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0f), input = 0.0f, correct = 2),  // val >= thresh
+  CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0f), input = +INF, correct = 2),  // val >= thresh
+  CHILD_INDEX_TEST_PARAMS(
+    node = NODE(thresh = 1.0f), input = -3.141592f, correct = 1),  // val !>= thresh
+  CHILD_INDEX_TEST_PARAMS(                                         // val >= thresh (e**pi > pi**e)
+    node    = NODE(thresh = 22.459158f),
+    input   = 23.140693f,
+    correct = 2),
+  CHILD_INDEX_TEST_PARAMS(  // val >= thresh for both negative
+    node    = NODE(thresh = -0.37f),
+    input   = -0.36f,
+    correct = 2),                                                                   // val >= thresh
+  CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = -INF), input = 0.36f, correct = 2),  // val >= thresh
+  CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0f), input = NAN, correct = 2),    // !def_left
+  CHILD_INDEX_TEST_PARAMS(node = NODE(def_left = true), input = NAN, correct = 1),  // !def_left
+  CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = NAN), input = NAN, correct = 2),     // !def_left
+  CHILD_INDEX_TEST_PARAMS(
+    node = NODE(def_left = true, thresh = NAN), input = NAN, correct = 1),        // !def_left
+  CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = NAN), input = 0.0f, correct = 1),  // val !>= thresh
+  CHILD_INDEX_TEST_PARAMS(
+    node = NODE(thresh = 0.0f), parent_node_idx = 1, input = -INF, correct = 3),
+  CHILD_INDEX_TEST_PARAMS(
+    node = NODE(thresh = 0.0f), parent_node_idx = 1, input = 0.0f, correct = 4),
+  CHILD_INDEX_TEST_PARAMS(
+    node = NODE(thresh = 0.0f), parent_node_idx = 2, input = -INF, correct = 5),
+  CHILD_INDEX_TEST_PARAMS(
+    node = NODE(thresh = 0.0f), parent_node_idx = 2, input = 0.0f, correct = 6),
+  CHILD_INDEX_TEST_PARAMS(
+    node = NODE(thresh = 0.0f), parent_node_idx = 3, input = -INF, correct = 7),
+  CHILD_INDEX_TEST_PARAMS(
+    node = NODE(thresh = 0.0f), parent_node_idx = 3, input = 0.0f, correct = 8),
+  CHILD_INDEX_TEST_PARAMS(
+    node = NODE(thresh = 0.0f), parent_node_idx = 4, input = -INF, correct = 9),
+  CHILD_INDEX_TEST_PARAMS(
+    node = NODE(thresh = 0.0f), parent_node_idx = 4, input = 0.0f, correct = 10),
+  CHILD_INDEX_TEST_PARAMS(parent_node_idx = 4, input = NAN, correct = 10),  // !def_left
+  CHILD_INDEX_TEST_PARAMS(
+    node = NODE(def_left = true), input = NAN, parent_node_idx = 4, correct = 9),  // !def_left
+  // cannot match ( > max_matching)
+  CHILD_INDEX_TEST_PARAMS(node             = NODE(is_categorical = true),
+                          cso.bits         = {},
+                          cso.max_matching = {-1},
+                          input            = 0,
+                          correct          = 1),
+  // does not match (bits[category] == 0, category == 0)
+  CHILD_INDEX_TEST_PARAMS(node             = NODE(is_categorical = true),
+                          cso.bits         = {0b0000'0000},
+                          cso.max_matching = {0},
+                          input            = 0,
+                          correct          = 1),
+  // matches
+  CHILD_INDEX_TEST_PARAMS(node             = NODE(is_categorical = true),
+                          cso.bits         = {0b0000'0001},
+                          cso.max_matching = {0},
+                          input            = 0,
+                          correct          = 2),
+  // matches
+  CHILD_INDEX_TEST_PARAMS(node             = NODE(is_categorical = true),
+                          cso.bits         = {0b0000'0101},
+                          cso.max_matching = {2, -1},
+                          input            = 2,
+                          correct          = 2),
+  // does not match (bits[category] == 0, category > 0)
+  CHILD_INDEX_TEST_PARAMS(node             = NODE(is_categorical = true),
+                          cso.bits         = {0b0000'0101},
+                          cso.max_matching = {2},
+                          input            = 1,
+                          correct          = 1),
+  // cannot match (max_matching[fid=1] == -1)
+  CHILD_INDEX_TEST_PARAMS(node             = NODE(is_categorical = true),
+                          node.fid         = 1,
+                          cso.bits         = {0b0000'0101},
+                          cso.max_matching = {2, -1},
+                          input            = 2,
+                          correct          = 1),
+};
+
+TEST_P(ChildIndexTestDense, Predict) { check(); }
+TEST_P(ChildIndexTestSparse16, Predict) { check(); }
+TEST_P(ChildIndexTestSparse8, Predict) { check(); }
+
+INSTANTIATE_TEST_CASE_P(FilTests, ChildIndexTestDense, testing::ValuesIn(params));
+INSTANTIATE_TEST_CASE_P(FilTests, ChildIndexTestSparse16, testing::ValuesIn(params));
+INSTANTIATE_TEST_CASE_P(FilTests, ChildIndexTestSparse8, testing::ValuesIn(params));
+}  // namespace ML
diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu
index 499092d1dd..293222667e 100644
--- a/cpp/test/sg/fil_test.cu
+++ b/cpp/test/sg/fil_test.cu
@@ -21,6 +21,10 @@
 #include <cuml/fil/fil.h>
 
 #include <raft/cudart_utils.h>
+#include <test_utils.h>
+#include <thrust/execution_policy.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/random/rng.cuh>
 
@@ -55,6 +59,16 @@ struct FilTestParams {
   int depth       = 8;
   int num_trees   = 50;
   float leaf_prob = 0.05;
+  // below, categorical nodes means categorical inner nodes
+  // probability that a node is categorical (given that its feature is categorical)
+  float node_categorical_prob = 0.0f;
+  // probability that a feature is categorical (pertains to data generation, can
+  // still be interpreted as numerical by a node)
+  float feature_categorical_prob = 0.0f;
+  // during model creation, how often categories < max_matching are marked as matching?
+  float cat_match_prob = 0.5f;
+  // Order Of Magnitude for maximum matching category for categorical nodes
+  float max_magnitude_of_matching_cat = 1.0f;
   // output parameters
   output_t output   = output_t::RAW;
   float threshold   = 0.0f;
@@ -109,7 +123,11 @@ std::ostream& operator<<(std::ostream& os, const FilTestParams& ps)
      << ", blocks_per_sm = " << ps.blocks_per_sm << ", algo = " << ps.algo << ", seed = " << ps.seed
      << ", tolerance = " << ps.tolerance << ", op = " << tl::OpName(ps.op)
      << ", global_bias = " << ps.global_bias << ", leaf_algo = " << ps.leaf_algo
-     << ", num_classes = " << ps.num_classes;
+     << ", num_classes = " << ps.num_classes
+     << ", node_categorical_prob = " << ps.node_categorical_prob
+     << ", feature_categorical_prob = " << ps.feature_categorical_prob
+     << ", cat_match_prob = " << ps.cat_match_prob
+     << ", max_magnitude_of_matching_cat = " << ps.max_magnitude_of_matching_cat;
   return os;
 }
 
@@ -122,6 +140,74 @@ __global__ void nan_kernel(float* data, const bool* mask, int len, float nan)
 
 float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); }
 
+void hard_clipped_bernoulli(
+  raft::random::Rng rng, float* d, std::size_t n_vals, float prob_of_zero, cudaStream_t stream)
+{
+  rng.uniform(d, n_vals, 0.0f, 1.0f, stream);
+  thrust::transform(
+    thrust::cuda::par.on(stream), d, d + n_vals, d, [=] __device__(float uniform_0_1) -> float {
+      // if prob_of_zero == 0.0f, we should never generate a zero
+      if (prob_of_zero == 0.0f) return 1.0f;
+      float truly_0_1 = fmax(fmin(uniform_0_1, 1.0f), 0.0f);
+      // if prob_of_zero == 1.0f, we should never generate a one, hence ">"
+      return truly_0_1 > prob_of_zero ? 1.0f : 0.0f;
+    });
+}
+
+struct replace_some_floating_with_categorical {
+  int* max_matching_cat_d;
+  int num_cols;
+  __device__ float operator()(float data, int data_idx)
+  {
+    int max_matching_cat = max_matching_cat_d[data_idx % num_cols];
+    if (max_matching_cat == -1) return data;
+    return roundf((data * 0.5f + 0.5f) * max_matching_cat);
+  }
+};
+
+__global__ void floats_to_bit_stream_k(uint8_t* dst, float* src, std::size_t size)
+{
+  std::size_t idx = std::size_t(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (idx >= size) return;
+  int byte = 0;
+#pragma unroll
+  for (int i = 0; i < BITS_PER_BYTE; ++i)
+    byte |= (int)src[idx * BITS_PER_BYTE + i] << i;
+  dst[idx] = byte;
+}
+
+void adjust_threshold_to_treelite(
+  float* pthreshold, int* tl_left, int* tl_right, bool* default_left, tl::Operator comparison_op)
+{
+  // in treelite (take left node if val [op] threshold),
+  // the meaning of the condition is reversed compared to FIL;
+  // thus, "<" in treelite corresonds to comparison ">=" used by FIL
+  // https://github.com/dmlc/treelite/blob/master/include/treelite/tree.h#L243
+  // TODO(levsnv): remove workaround once confirmed to work with empty category lists in Treelite
+  if (isnan(*pthreshold)) {
+    std::swap(*tl_left, *tl_right);
+    *default_left = !*default_left;
+    return;
+  }
+  switch (comparison_op) {
+    case tl::Operator::kLT: break;
+    case tl::Operator::kLE:
+      // x <= y is equivalent to x < y', where y' is the next representable float
+      *pthreshold = std::nextafterf(*pthreshold, -std::numeric_limits<float>::infinity());
+      break;
+    case tl::Operator::kGT:
+      // x > y is equivalent to x >= y', where y' is the next representable float
+      // left and right still need to be swapped
+      *pthreshold = std::nextafterf(*pthreshold, -std::numeric_limits<float>::infinity());
+    case tl::Operator::kGE:
+      // swap left and right
+      std::swap(*tl_left, *tl_right);
+      *default_left = !*default_left;
+      break;
+    default: ASSERT(false, "only <, >, <= and >= comparisons are supported");
+  }
+}
+
 class BaseFilTest : public testing::TestWithParam<FilTestParams> {
  protected:
   void setup_helper()
@@ -156,19 +242,20 @@ class BaseFilTest : public testing::TestWithParam<FilTestParams> {
     /// weights, used as float* or int*
     int* weights_d      = nullptr;
     float* thresholds_d = nullptr;
-    int* fids_d         = nullptr;
     bool* def_lefts_d   = nullptr;
     bool* is_leafs_d    = nullptr;
     bool* def_lefts_h   = nullptr;
     bool* is_leafs_h    = nullptr;
+    rmm::device_uvector<float> is_categoricals_d(num_nodes, stream);
 
     // allocate GPU data
     raft::allocate(weights_d, num_nodes, stream);
     // sizeof(float) == sizeof(int)
     raft::allocate(thresholds_d, num_nodes, stream);
-    raft::allocate(fids_d, num_nodes, stream);
     raft::allocate(def_lefts_d, num_nodes, stream);
     raft::allocate(is_leafs_d, num_nodes, stream);
+    fids_d.resize(num_nodes, stream);
+    max_matching_cat_d.resize(ps.num_cols, stream);
 
     // generate on-GPU random data
     raft::random::Rng r(ps.seed);
@@ -193,21 +280,47 @@ class BaseFilTest : public testing::TestWithParam<FilTestParams> {
       r.uniform((float*)weights_d, num_nodes, -1.0f, 1.0f, stream);
     }
     r.uniform(thresholds_d, num_nodes, -1.0f, 1.0f, stream);
-    r.uniformInt(fids_d, num_nodes, 0, ps.num_cols, stream);
+    r.uniformInt(fids_d.data(), num_nodes, 0, ps.num_cols, stream);
     r.bernoulli(def_lefts_d, num_nodes, 0.5f, stream);
     r.bernoulli(is_leafs_d, num_nodes, 1.0f - ps.leaf_prob, stream);
+    hard_clipped_bernoulli(
+      r, is_categoricals_d.data(), num_nodes, 1.0f - ps.node_categorical_prob, stream);
 
     // copy data to host
-    std::vector<float> thresholds_h(num_nodes);
-    std::vector<int> weights_h(num_nodes), fids_h(num_nodes);
+    std::vector<float> thresholds_h(num_nodes), is_categoricals_h(num_nodes);
+    std::vector<int> max_matching_cat_h(ps.num_cols), weights_h(num_nodes), fids_h(num_nodes),
+      node_cat_set(num_nodes);
+    std::vector<bool> feature_categorical(ps.num_cols);
+    // bool vectors are not guaranteed to be stored byte-per-value
     def_lefts_h = new bool[num_nodes];
     is_leafs_h  = new bool[num_nodes];
 
+    // uniformily distributed in orders of magnitude: smaller models which
+    // still stress large bitfields.
+    // up to 10**ps.max_magnitude_of_matching_cat (only if feature is categorical, else -1)
+    std::mt19937 gen(ps.seed);
+    std::uniform_real_distribution mmc(-1.0f, ps.max_magnitude_of_matching_cat);
+    std::bernoulli_distribution fc(ps.feature_categorical_prob);
+    cat_sets_h.max_matching.resize(ps.num_cols);
+    for (int fid = 0; fid < ps.num_cols; ++fid) {
+      feature_categorical[fid] = fc(gen);
+      if (feature_categorical[fid]) {
+        // even for some categorical features, we will have no matching categories
+        float mm = pow(10, mmc(gen)) - 1.0f;
+        ASSERT(mm < INT_MAX,
+               "internal error: max_magnitude_of_matching_cat %f is too large",
+               ps.max_magnitude_of_matching_cat);
+        cat_sets_h.max_matching[fid] = mm;
+      } else {
+        cat_sets_h.max_matching[fid] = -1;
+      }
+    }
     raft::update_host(weights_h.data(), (int*)weights_d, num_nodes, stream);
     raft::update_host(thresholds_h.data(), thresholds_d, num_nodes, stream);
-    raft::update_host(fids_h.data(), fids_d, num_nodes, stream);
+    raft::update_host(fids_h.data(), fids_d.data(), num_nodes, stream);
     raft::update_host(def_lefts_h, def_lefts_d, num_nodes, stream);
     raft::update_host(is_leafs_h, is_leafs_d, num_nodes, stream);
+    raft::update_host(is_categoricals_h.data(), is_categoricals_d.data(), num_nodes, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
 
     // mark leaves
@@ -220,6 +333,44 @@ class BaseFilTest : public testing::TestWithParam<FilTestParams> {
       }
     }
 
+    // count nodes for each feature id, while splitting the sets between nodes
+    std::size_t bit_pool_size = 0;
+    cat_sets_h.n_nodes        = std::vector<std::size_t>(ps.num_cols, 0);
+    for (std::size_t node_id = 0; node_id < num_nodes; ++node_id) {
+      int fid = fids_h[node_id];
+
+      if (!feature_categorical[fid] || is_leafs_h[node_id]) is_categoricals_h[node_id] = 0.0f;
+
+      if (is_categoricals_h[node_id] == 1.0) {
+        // might allocate a categorical set for an unreachable inner node. That's OK.
+        ++cat_sets_h.n_nodes[fid];
+        node_cat_set[node_id] = bit_pool_size;
+        bit_pool_size += cat_sets_h.accessor().sizeof_mask(fid);
+      }
+    }
+    cat_sets_h.bits.resize(bit_pool_size);
+    raft::update_device(
+      max_matching_cat_d.data(), cat_sets_h.max_matching.data(), ps.num_cols, stream);
+    // calculate sizes and allocate arrays for category sets
+    // fill category sets
+    // there is a faster trick with a 256-byte LUT, but we can implement it later if the tests
+    // become too slow
+    rmm::device_uvector<float> bits_precursor_d(cat_sets_h.bits.size() * BITS_PER_BYTE, stream);
+    rmm::device_uvector<uint8_t> bits_d(cat_sets_h.bits.size(), stream);
+    if (cat_sets_h.bits.size() != 0) {
+      hard_clipped_bernoulli(r,
+                             bits_precursor_d.data(),
+                             cat_sets_h.bits.size() * BITS_PER_BYTE,
+                             1.0f - ps.cat_match_prob,
+                             stream);
+      floats_to_bit_stream_k<<<raft::ceildiv(cat_sets_h.bits.size(), (std::size_t)FIL_TPB),
+                               FIL_TPB,
+                               0,
+                               stream>>>(
+        bits_d.data(), bits_precursor_d.data(), cat_sets_h.bits.size());
+      raft::update_host(cat_sets_h.bits.data(), bits_d.data(), cat_sets_h.bits.size(), stream);
+    }
+
     // initialize nodes
     nodes.resize(num_nodes);
     for (size_t i = 0; i < num_nodes; ++i) {
@@ -235,7 +386,15 @@ class BaseFilTest : public testing::TestWithParam<FilTestParams> {
         case fil::leaf_algo_t::VECTOR_LEAF: w.idx = i; break;
         default: ASSERT(false, "internal error: invalid ps.leaf_algo");
       }
-      nodes[i] = fil::dense_node(w, thresholds_h[i], fids_h[i], def_lefts_h[i], is_leafs_h[i]);
+      // make sure nodes are categorical only when their feature ID is categorical
+      bool is_categorical = is_categoricals_h[i] == 1.0f;
+      val_t split;
+      if (is_categorical)
+        split.idx = node_cat_set[i];
+      else
+        split.f = thresholds_h[i];
+      nodes[i] =
+        fil::dense_node(w, split, fids_h[i], def_lefts_h[i], is_leafs_h[i], is_categorical);
     }
 
     // clean up
@@ -243,9 +402,9 @@ class BaseFilTest : public testing::TestWithParam<FilTestParams> {
     delete[] is_leafs_h;
     CUDA_CHECK(cudaFree(is_leafs_d));
     CUDA_CHECK(cudaFree(def_lefts_d));
-    CUDA_CHECK(cudaFree(fids_d));
     CUDA_CHECK(cudaFree(thresholds_d));
     CUDA_CHECK(cudaFree(weights_d));
+    // cat_sets_h.bits and max_matching_cat_d are now visible to host
   }
 
   void generate_data()
@@ -259,6 +418,13 @@ class BaseFilTest : public testing::TestWithParam<FilTestParams> {
     // generate random data
     raft::random::Rng r(ps.seed);
     r.uniform(data_d, num_data, -1.0f, 1.0f, stream);
+    thrust::transform(
+      thrust::cuda::par.on(stream),
+      data_d,
+      data_d + num_data,
+      thrust::counting_iterator(0),
+      data_d,
+      replace_some_floating_with_categorical{max_matching_cat_d.data(), ps.num_cols});
     r.bernoulli(mask_d, num_data, ps.nan_prob, stream);
     int tpb = 256;
     nan_kernel<<<raft::ceildiv(int(num_data), tpb), tpb, 0, stream>>>(
@@ -306,15 +472,17 @@ class BaseFilTest : public testing::TestWithParam<FilTestParams> {
   {
     // predict on host
     std::vector<float> want_preds_h(ps.num_preds_outputs());
-    std::vector<float> want_proba_h(ps.num_proba_outputs());
+    want_proba_h.resize(ps.num_proba_outputs());
     int num_nodes = tree_num_nodes();
     std::vector<float> class_scores(ps.num_classes);
+    // we use tree_base::child_index() on CPU
+    tree_base base{cat_sets_h.accessor()};
     switch (ps.leaf_algo) {
       case fil::leaf_algo_t::FLOAT_UNARY_BINARY:
         for (int i = 0; i < ps.num_rows; ++i) {
           float pred = 0.0f;
           for (int j = 0; j < ps.num_trees; ++j) {
-            pred += infer_one_tree(&nodes[j * num_nodes], &data_h[i * ps.num_cols]).f;
+            pred += infer_one_tree(&nodes[j * num_nodes], &data_h[i * ps.num_cols], base).f;
           }
           transform(pred, want_proba_h[i * 2 + 1], want_preds_h[i]);
           complement(&(want_proba_h[i * 2]));
@@ -325,7 +493,7 @@ class BaseFilTest : public testing::TestWithParam<FilTestParams> {
           std::fill(class_scores.begin(), class_scores.end(), 0.0f);
           for (int tree = 0; tree < ps.num_trees; ++tree) {
             class_scores[tree % ps.num_classes] +=
-              infer_one_tree(&nodes[tree * num_nodes], &data_h[row * ps.num_cols]).f;
+              infer_one_tree(&nodes[tree * num_nodes], &data_h[row * ps.num_cols], base).f;
           }
           want_preds_h[row] =
             std::max_element(class_scores.begin(), class_scores.end()) - class_scores.begin();
@@ -342,7 +510,8 @@ class BaseFilTest : public testing::TestWithParam<FilTestParams> {
         for (int r = 0; r < ps.num_rows; ++r) {
           std::fill(class_votes.begin(), class_votes.end(), 0);
           for (int j = 0; j < ps.num_trees; ++j) {
-            int class_label = infer_one_tree(&nodes[j * num_nodes], &data_h[r * ps.num_cols]).idx;
+            int class_label =
+              infer_one_tree(&nodes[j * num_nodes], &data_h[r * ps.num_cols], base).idx;
             ++class_votes[class_label];
           }
           for (int c = 0; c < ps.num_classes; ++c) {
@@ -358,8 +527,9 @@ class BaseFilTest : public testing::TestWithParam<FilTestParams> {
         for (int r = 0; r < ps.num_rows; ++r) {
           std::vector<float> class_probabilities(ps.num_classes);
           for (int j = 0; j < ps.num_trees; ++j) {
-            int vector_index = infer_one_tree(&nodes[j * num_nodes], &data_h[r * ps.num_cols]).idx;
-            float sum        = 0.0;
+            int vector_index =
+              infer_one_tree(&nodes[j * num_nodes], &data_h[r * ps.num_cols], base).idx;
+            float sum = 0.0;
             for (int k = 0; k < ps.num_classes; k++) {
               class_probabilities[k] += vector_leaf[vector_index * ps.num_classes + k];
               sum += vector_leaf[vector_index * ps.num_classes + k];
@@ -421,16 +591,15 @@ class BaseFilTest : public testing::TestWithParam<FilTestParams> {
       want_preds_d, preds_d, ps.num_rows, raft::CompareApprox<float>(tolerance), stream));
   }
 
-  fil::val_t infer_one_tree(fil::dense_node* root, float* data)
+  fil::val_t infer_one_tree(fil::dense_node* root, float* data, const tree_base& tree)
   {
     int curr = 0;
     fil::val_t output{.f = 0.0f};
     for (;;) {
       const fil::dense_node& node = root[curr];
-      if (node.is_leaf()) return node.base_node::output<val_t>();
+      if (node.is_leaf()) return node.template output<val_t>();
       float val = data[node.fid()];
-      bool cond = isnan(val) ? !node.def_left() : val >= node.thresh();
-      curr      = (curr << 1) + 1 + (cond ? 1 : 0);
+      curr      = tree.child_index<true>(node, curr, val);
     }
     return output;
   }
@@ -448,10 +617,14 @@ class BaseFilTest : public testing::TestWithParam<FilTestParams> {
   // input data
   float* data_d = nullptr;
   std::vector<float> data_h;
+  std::vector<float> want_proba_h;
 
   // forest data
   std::vector<fil::dense_node> nodes;
   std::vector<float> vector_leaf;
+  cat_sets_owner cat_sets_h;
+  rmm::device_uvector<int> fids_d             = rmm::device_uvector<int>(0, cudaStream_t());
+  rmm::device_uvector<int> max_matching_cat_d = rmm::device_uvector<int>(0, cudaStream_t());
 
   // parameters
   cudaStream_t stream = 0;
@@ -478,7 +651,7 @@ class PredictDenseFilTest : public BaseFilTest {
     fil_ps.threads_per_tree = ps.threads_per_tree;
     fil_ps.n_items          = ps.n_items;
 
-    fil::init_dense(handle, pforest, nodes.data(), &fil_ps, vector_leaf);
+    fil::init_dense(handle, pforest, cat_sets_h.accessor(), vector_leaf, nodes.data(), &fil_ps);
   }
 };
 
@@ -493,12 +666,8 @@ class BasePredictSparseFilTest : public BaseFilTest {
     const fil::dense_node& node = dense_root[i_dense];
     if (node.is_leaf()) {
       // leaf sparse node
-      sparse_nodes[i_sparse] = fil_node_t(node.base_node::output<val_t>(),
-                                          node.thresh(),
-                                          node.fid(),
-                                          node.def_left(),
-                                          node.is_leaf(),
-                                          0);
+      sparse_nodes[i_sparse] =
+        fil_node_t(node.output<val_t>(), {}, node.fid(), node.def_left(), node.is_leaf(), false, 0);
       return;
     }
     // inner sparse node
@@ -506,11 +675,12 @@ class BasePredictSparseFilTest : public BaseFilTest {
     int left_index = sparse_nodes.size();
     sparse_nodes.push_back(fil_node_t());
     sparse_nodes.push_back(fil_node_t());
-    sparse_nodes[i_sparse] = fil_node_t(node.base_node::output<val_t>(),
-                                        node.thresh(),
+    sparse_nodes[i_sparse] = fil_node_t({},
+                                        node.split(),
                                         node.fid(),
                                         node.def_left(),
                                         node.is_leaf(),
+                                        node.is_categorical(),
                                         left_index - i_sparse_root);
     dense2sparse_node(dense_root, 2 * i_dense + 1, i_sparse_root, left_index);
     dense2sparse_node(dense_root, 2 * i_dense + 2, i_sparse_root, left_index + 1);
@@ -549,7 +719,13 @@ class BasePredictSparseFilTest : public BaseFilTest {
 
     dense2sparse();
     fil_params.num_nodes = sparse_nodes.size();
-    fil::init_sparse(handle, pforest, trees.data(), sparse_nodes.data(), &fil_params, vector_leaf);
+    fil::init_sparse(handle,
+                     pforest,
+                     cat_sets_h.accessor(),
+                     vector_leaf,
+                     trees.data(),
+                     sparse_nodes.data(),
+                     &fil_params);
   }
   std::vector<fil_node_t> sparse_nodes;
   std::vector<int> trees;
@@ -568,17 +744,18 @@ class TreeliteFilTest : public BaseFilTest {
     int key = (*pkey)++;
     builder->CreateNode(key);
     const fil::dense_node& dense_node = nodes[node];
+    std::vector<std::uint32_t> left_categories;
     if (dense_node.is_leaf()) {
       switch (ps.leaf_algo) {
         case fil::leaf_algo_t::FLOAT_UNARY_BINARY:
         case fil::leaf_algo_t::GROVE_PER_CLASS:
           // default is fil::FLOAT_UNARY_BINARY
-          builder->SetLeafNode(key, tlf::Value::Create(dense_node.base_node::output<val_t>().f));
+          builder->SetLeafNode(key, tlf::Value::Create(dense_node.output<float>()));
           break;
         case fil::leaf_algo_t::CATEGORICAL_LEAF: {
           std::vector<tlf::Value> vec(ps.num_classes);
           for (int i = 0; i < ps.num_classes; ++i) {
-            vec[i] = tlf::Value::Create(i == dense_node.template output<val_t>().idx ? 1.0f : 0.0f);
+            vec[i] = tlf::Value::Create(i == dense_node.output<int>() ? 1.0f : 0.0f);
           }
           builder->SetLeafVectorNode(key, vec);
           break;
@@ -586,7 +763,7 @@ class TreeliteFilTest : public BaseFilTest {
         case fil::leaf_algo_t::VECTOR_LEAF: {
           std::vector<tlf::Value> vec(ps.num_classes);
           for (int i = 0; i < ps.num_classes; ++i) {
-            auto idx = dense_node.template output<val_t>().idx;
+            auto idx = dense_node.output<int>();
             vec[i]   = tlf::Value::Create(vector_leaf[idx * ps.num_classes + i]);
           }
           builder->SetLeafVectorNode(key, vec);
@@ -598,33 +775,42 @@ class TreeliteFilTest : public BaseFilTest {
     } else {
       int left          = root + 2 * (node - root) + 1;
       int right         = root + 2 * (node - root) + 2;
-      float threshold   = dense_node.thresh();
       bool default_left = dense_node.def_left();
-      switch (ps.op) {
-        case tl::Operator::kLT: break;
-        case tl::Operator::kLE:
-          // adjust the threshold
-          threshold = std::nextafterf(threshold, -std::numeric_limits<float>::infinity());
-          break;
-        case tl::Operator::kGT:
-          // adjust the threshold; left and right still need to be swapped
-          threshold = std::nextafterf(threshold, -std::numeric_limits<float>::infinity());
-        case tl::Operator::kGE:
-          // swap left and right
-          std::swap(left, right);
-          default_left = !default_left;
-          break;
-        default: ASSERT(false, "comparison operator must be <, >, <= or >=");
+      float threshold   = dense_node.is_categorical() ? NAN : dense_node.thresh();
+      if (dense_node.is_categorical()) {
+        uint8_t byte = 0;
+        for (int category = 0; category <= cat_sets_h.max_matching[dense_node.fid()]; ++category) {
+          if (category % BITS_PER_BYTE == 0) {
+            byte = cat_sets_h.bits[dense_node.set() + category / BITS_PER_BYTE];
+          }
+          if ((byte & (1 << (category % BITS_PER_BYTE))) != 0) {
+            left_categories.push_back(category);
+          }
+        }
       }
       int left_key  = node_to_treelite(builder, pkey, root, left);
       int right_key = node_to_treelite(builder, pkey, root, right);
-      builder->SetNumericalTestNode(key,
-                                    dense_node.fid(),
-                                    ps.op,
-                                    tlf::Value::Create(threshold),
-                                    default_left,
-                                    left_key,
-                                    right_key);
+      // TODO(levsnv): remove workaround once confirmed to work with empty category lists in
+      // Treelite
+      if (!left_categories.empty() && dense_node.is_categorical()) {
+        // Treelite builder APIs don't allow to set categorical_split_right_child
+        // (which child the categories pertain to). Only the Tree API allows that.
+        // in FIL, categories always pertain to the right child, and the default in treelite
+        // is left categories in SetCategoricalTestNode
+        std::swap(left_key, right_key);
+        default_left = !default_left;
+        builder->SetCategoricalTestNode(
+          key, dense_node.fid(), left_categories, default_left, left_key, right_key);
+      } else {
+        adjust_threshold_to_treelite(&threshold, &left_key, &right_key, &default_left, ps.op);
+        builder->SetNumericalTestNode(key,
+                                      dense_node.fid(),
+                                      ps.op,
+                                      tlf::Value::Create(threshold),
+                                      default_left,
+                                      left_key,
+                                      right_key);
+      }
     }
     return key;
   }
@@ -828,7 +1014,11 @@ std::vector<FilTestParams> predict_dense_inputs = {
                   output      = AVG_SOFTMAX,
                   leaf_algo   = GROVE_PER_CLASS,
                   num_classes = FIL_TPB + 1),
-  FIL_TEST_PARAMS(num_cols = 100'000, depth = 5, num_trees = 1, leaf_algo = FLOAT_UNARY_BINARY),
+  FIL_TEST_PARAMS(num_rows  = 10'000,
+                  num_cols  = 100'000,
+                  depth     = 5,
+                  num_trees = 1,
+                  leaf_algo = FLOAT_UNARY_BINARY),
   FIL_TEST_PARAMS(num_rows    = 101,
                   num_cols    = 100'000,
                   depth       = 5,
@@ -878,6 +1068,15 @@ std::vector<FilTestParams> predict_dense_inputs = {
                   num_trees   = 3,
                   leaf_algo   = VECTOR_LEAF,
                   num_classes = 4000),
+  FIL_TEST_PARAMS(node_categorical_prob = 0.5, feature_categorical_prob = 0.5),
+  FIL_TEST_PARAMS(
+    node_categorical_prob = 1.0, feature_categorical_prob = 1.0, cat_match_prob = 1.0),
+  FIL_TEST_PARAMS(
+    node_categorical_prob = 1.0, feature_categorical_prob = 1.0, cat_match_prob = 0.0),
+  FIL_TEST_PARAMS(depth                         = 3,
+                  node_categorical_prob         = 0.5,
+                  feature_categorical_prob      = 0.5,
+                  max_magnitude_of_matching_cat = 5),
 };
 
 TEST_P(PredictDenseFilTest, Predict) { compare(); }
@@ -947,6 +1146,15 @@ std::vector<FilTestParams> predict_sparse_inputs = {
                   num_trees   = 530,
                   leaf_algo   = VECTOR_LEAF,
                   num_classes = 1111),
+  FIL_TEST_PARAMS(node_categorical_prob = 0.5, feature_categorical_prob = 0.5),
+  FIL_TEST_PARAMS(
+    node_categorical_prob = 1.0, feature_categorical_prob = 1.0, cat_match_prob = 1.0),
+  FIL_TEST_PARAMS(
+    node_categorical_prob = 1.0, feature_categorical_prob = 1.0, cat_match_prob = 0.0),
+  FIL_TEST_PARAMS(depth                         = 3,
+                  node_categorical_prob         = 0.5,
+                  feature_categorical_prob      = 0.5,
+                  max_magnitude_of_matching_cat = 5),
 };
 
 TEST_P(PredictSparse16FilTest, Predict) { compare(); }
@@ -1041,6 +1249,15 @@ std::vector<FilTestParams> import_dense_inputs = {
   FIL_TEST_PARAMS(print_forest_shape = true),
   FIL_TEST_PARAMS(leaf_algo = VECTOR_LEAF, num_classes = 2),
   FIL_TEST_PARAMS(leaf_algo = VECTOR_LEAF, num_trees = 19, num_classes = 20),
+  FIL_TEST_PARAMS(node_categorical_prob = 0.5, feature_categorical_prob = 0.5),
+  FIL_TEST_PARAMS(
+    node_categorical_prob = 1.0, feature_categorical_prob = 1.0, cat_match_prob = 1.0),
+  FIL_TEST_PARAMS(
+    node_categorical_prob = 1.0, feature_categorical_prob = 1.0, cat_match_prob = 0.0),
+  FIL_TEST_PARAMS(depth                         = 3,
+                  node_categorical_prob         = 0.5,
+                  feature_categorical_prob      = 0.5,
+                  max_magnitude_of_matching_cat = 5),
 };
 
 TEST_P(TreeliteDenseFilTest, Import) { compare(); }
@@ -1085,6 +1302,15 @@ std::vector<FilTestParams> import_sparse_inputs = {
                   num_classes = 3),
   FIL_TEST_PARAMS(leaf_algo = VECTOR_LEAF, num_classes = 2),
   FIL_TEST_PARAMS(leaf_algo = VECTOR_LEAF, num_trees = 19, num_classes = 20),
+  FIL_TEST_PARAMS(node_categorical_prob = 0.5, feature_categorical_prob = 0.5),
+  FIL_TEST_PARAMS(
+    node_categorical_prob = 1.0, feature_categorical_prob = 1.0, cat_match_prob = 1.0),
+  FIL_TEST_PARAMS(
+    node_categorical_prob = 1.0, feature_categorical_prob = 1.0, cat_match_prob = 0.0),
+  FIL_TEST_PARAMS(depth                         = 3,
+                  node_categorical_prob         = 0.5,
+                  feature_categorical_prob      = 0.5,
+                  max_magnitude_of_matching_cat = 5),
 };
 
 TEST_P(TreeliteSparse16FilTest, Import) { compare(); }
diff --git a/cpp/test/sg/hdbscan_test.cu b/cpp/test/sg/hdbscan_test.cu
index a9299cb1d7..11e65e8553 100644
--- a/cpp/test/sg/hdbscan_test.cu
+++ b/cpp/test/sg/hdbscan_test.cu
@@ -116,7 +116,6 @@ class HDBSCANTest : public ::testing::TestWithParam<HDBSCANInputs<T, IdxT>> {
  protected:
   HDBSCANInputs<T, IdxT> params;
   IdxT* labels_ref;
-  int k;
 
   double score;
 };
diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu
index 9871911a38..59aa2c29d3 100644
--- a/cpp/test/sg/rf_test.cu
+++ b/cpp/test/sg/rf_test.cu
@@ -13,8 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #include <test_utils.h>
+#include <cuml/common/logger.hpp>
 
 #include <decisiontree/batched-levelalgo/kernels.cuh>
 #include <decisiontree/batched-levelalgo/quantile.cuh>
@@ -162,7 +162,7 @@ auto FilPredict(const raft::handle_t& handle,
   ModelHandle model;
   std::size_t num_outputs = 1;
   if constexpr (std::is_integral_v<LabelT>) { num_outputs = params.n_labels; }
-  build_treelite_forest(&model, forest, params.n_cols, num_outputs);
+  build_treelite_forest(&model, forest, params.n_cols);
   fil::treelite_params_t tl_params{fil::algo_t::ALGO_AUTO,
                                    num_outputs > 1,
                                    1.f / num_outputs,
@@ -177,6 +177,24 @@ auto FilPredict(const raft::handle_t& handle,
   return pred;
 }
 
+template <typename DataT, typename LabelT>
+auto FilPredictProba(const raft::handle_t& handle,
+                     RfTestParams params,
+                     DataT* X_transpose,
+                     RandomForestMetaData<DataT, LabelT>* forest)
+{
+  std::size_t num_outputs = params.n_labels;
+  auto pred = std::make_shared<thrust::device_vector<float>>(params.n_rows * num_outputs);
+  ModelHandle model;
+  static_assert(std::is_integral_v<LabelT>, "Must be classification");
+  build_treelite_forest(&model, forest, params.n_cols);
+  fil::treelite_params_t tl_params{
+    fil::algo_t::ALGO_AUTO, 0, 0.0f, fil::storage_type_t::AUTO, 8, 1, 0, nullptr};
+  fil::forest_t fil_forest;
+  fil::from_treelite(handle, &fil_forest, model, &tl_params);
+  fil::predict(handle, fil_forest, pred->data().get(), X_transpose, params.n_rows, true);
+  return pred;
+}
 template <typename DataT, typename LabelT>
 auto TrainScore(
   const raft::handle_t& handle, RfTestParams params, DataT* X, DataT* X_transpose, LabelT* y)
@@ -314,6 +332,7 @@ class RfSpecialisedTest {
                 raft::ceildiv(params.n_rows, params.min_samples_leaf));
     }
   }
+
   void TestMinImpurity()
   {
     for (int i = 0u; i < forest->rf_params.n_trees; i++) {
@@ -322,11 +341,11 @@ class RfSpecialisedTest {
       }
     }
   }
-  void TestDeterminism()
 
+  void TestDeterminism()
   {
     // Regression models use floating point atomics, so are not bitwise reproducible
-    bool is_regression = params.split_criterion == MSE || params.split_criterion == MAE;
+    bool is_regression = params.split_criterion != GINI and params.split_criterion != ENTROPY;
     if (is_regression) return;
 
     // Repeat training
@@ -351,6 +370,28 @@ class RfSpecialisedTest {
       }
     }
   }
+
+  // Difference between the largest element and second largest
+  DataT MinDifference(DataT* begin, std::size_t len)
+  {
+    std::size_t max_element_index = 0;
+    DataT max_element             = 0.0;
+    for (std::size_t i = 0; i < len; i++) {
+      if (begin[i] > max_element) {
+        max_element_index = i;
+        max_element       = begin[i];
+      }
+    }
+    DataT second_max_element = 0.0;
+    for (std::size_t i = 0; i < len; i++) {
+      if (begin[i] > second_max_element && i != max_element_index) {
+        second_max_element = begin[i];
+      }
+    }
+
+    return std::abs(max_element - second_max_element);
+  }
+
   // Compare fil against native rf predictions
   // Only for single precision models
   void TestFilPredict()
@@ -360,10 +401,26 @@ class RfSpecialisedTest {
     } else {
       raft::handle_t handle(params.n_streams);
       auto fil_pred = FilPredict(handle, params, X_transpose.data().get(), forest.get());
+
       thrust::host_vector<float> h_fil_pred(*fil_pred);
       thrust::host_vector<float> h_pred(*predictions);
+
+      thrust::host_vector<float> h_fil_pred_prob;
+      if constexpr (std::is_integral_v<LabelT>) {
+        h_fil_pred_prob = *FilPredictProba(handle, params, X_transpose.data().get(), forest.get());
+      }
+
       float tol = 1e-2;
       for (std::size_t i = 0; i < h_fil_pred.size(); i++) {
+        // If the output probabilities are very similar for different classes
+        // FIL may output a different class due to numerical differences
+        // Skip these cases
+        if constexpr (std::is_integral_v<LabelT>) {
+          int num_outputs = forest->trees[0]->num_outputs;
+          auto min_diff   = MinDifference(&h_fil_pred_prob[i * num_outputs], num_outputs);
+          if (min_diff < tol) continue;
+        }
+
         EXPECT_LE(abs(h_fil_pred[i] - h_pred[i]), tol);
       }
     }
@@ -393,7 +450,7 @@ class RfTest : public ::testing::TestWithParam<RfTestParams> {
   void SetUp() override
   {
     RfTestParams params = ::testing::TestWithParam<RfTestParams>::GetParam();
-    bool is_regression  = params.split_criterion == MSE || params.split_criterion == MAE;
+    bool is_regression  = params.split_criterion != GINI and params.split_criterion != ENTROPY;
     if (params.double_precision) {
       if (is_regression) {
         RfSpecialisedTest<double, double> test(params);
@@ -426,9 +483,14 @@ std::vector<int> min_samples_leaf        = {1, 10, 30};
 std::vector<int> min_samples_split       = {2, 10};
 std::vector<float> min_impurity_decrease = {0.0f, 1.0f, 10.0f};
 std::vector<int> n_streams               = {1, 2, 10};
-std::vector<CRITERION> split_criterion   = {CRITERION::MSE, CRITERION::GINI, CRITERION::ENTROPY};
+std::vector<CRITERION> split_criterion   = {CRITERION::INVERSE_GAUSSIAN,
+                                          CRITERION::GAMMA,
+                                          CRITERION::POISSON,
+                                          CRITERION::MSE,
+                                          CRITERION::GINI,
+                                          CRITERION::ENTROPY};
 std::vector<int> seed                    = {0, 17};
-std::vector<int> n_labels                = {2, 10, 30};
+std::vector<int> n_labels                = {2, 10, 20};
 std::vector<bool> double_precision       = {false, true};
 
 int n_tests = 100;
@@ -455,6 +517,7 @@ INSTANTIATE_TEST_CASE_P(RfTests,
                                                                            n_labels,
                                                                            double_precision)));
 
+//-------------------------------------------------------------------------------------------------------------------------------------
 struct QuantileTestParameters {
   int n_rows;
   int n_bins;
@@ -526,7 +589,7 @@ class RFQuantileTest : public ::testing::TestWithParam<QuantileTestParameters> {
     int min_items_per_bin = max_items_per_bin - 1;
     int total_items       = 0;
     for (int b = 0; b < params.n_bins; b++) {
-      ASSERT_TRUE(h_histogram[b] == max_items_per_bin || h_histogram[b] == min_items_per_bin)
+      ASSERT_TRUE(h_histogram[b] == max_items_per_bin or h_histogram[b] == min_items_per_bin)
         << "No. samples in bin[" << b << "] = " << h_histogram[b] << " Expected "
         << max_items_per_bin << " or " << min_items_per_bin << std::endl;
       total_items += h_histogram[b];
@@ -537,7 +600,6 @@ class RFQuantileTest : public ::testing::TestWithParam<QuantileTestParameters> {
   }
 };
 
-//-------------------------------------------------------------------------------------------------------------------------------------
 const std::vector<QuantileTestParameters> inputs = {{1000, 16, 6078587519764079670LLU},
                                                     {1130, 32, 4884670006177930266LLU},
                                                     {1752, 67, 9175325892580481371LLU},
@@ -564,4 +626,529 @@ typedef RFQuantileBinsLowerBoundTest<double> RFQuantileBinsLowerBoundTestD;
 TEST_P(RFQuantileBinsLowerBoundTestD, test) {}
 INSTANTIATE_TEST_CASE_P(RfTests, RFQuantileBinsLowerBoundTestD, ::testing::ValuesIn(inputs));
 
+//------------------------------------------------------------------------------------------------------
+
+TEST(RfTest, TextDump)
+{
+  RF_params rf_params = set_rf_params(2, 2, 1.0, 2, 1, 2, 0.0, true, 1, 1.0, 0, GINI, 1, 128);
+  auto forest         = std::make_shared<RandomForestMetaData<float, int>>();
+
+  std::vector<float> X_host      = {1, 2, 3, 6, 7, 8};
+  thrust::device_vector<float> X = X_host;
+  std::vector<int> y_host        = {0, 0, 1, 1, 1, 0};
+  thrust::device_vector<int> y   = y_host;
+
+  raft::handle_t handle(1);
+  auto forest_ptr = forest.get();
+  fit(handle, forest_ptr, X.data().get(), y.size(), 1, y.data().get(), 2, rf_params);
+
+  std::string expected_start_text = R"(Forest has 1 trees, max_depth 2, and max_leaves 2
+Tree #0
+ Decision Tree depth --> 1 and n_leaves --> 2
+ Tree Fitting - Overall time -->)";
+
+  std::string expected_end_text = R"(└(colid: 0, quesval: 3, best_metric_val: 0.25)
+    ├(leaf, prediction: [0.75, 0.25], best_metric_val: 0)
+    └(leaf, prediction: [0, 1], best_metric_val: 0))";
+
+  EXPECT_TRUE(get_rf_detailed_text(forest_ptr).find(expected_start_text) != std::string::npos);
+  EXPECT_TRUE(get_rf_detailed_text(forest_ptr).find(expected_end_text) != std::string::npos);
+
+  std::string expected_json = R"([
+{"nodeid": 0, "split_feature": 0, "split_threshold": 3, "gain": 0.25, "instance_count": 6, "yes": 1, "no": 2, "children": [
+  {"nodeid": 1, "leaf_value": [0.75, 0.25], "instance_count": 4},
+  {"nodeid": 2, "leaf_value": [0, 1], "instance_count": 2}
+]}
+])";
+  EXPECT_EQ(get_rf_json(forest_ptr), expected_json);
+}
+
+//-------------------------------------------------------------------------------------------------------------------------------------
+namespace DT {
+
+struct ObjectiveTestParameters {
+  uint64_t seed;
+  int n_rows;
+  int n_bins;
+  int n_classes;
+  int min_samples_leaf;
+  double tolerance;
+};
+
+template <typename ObjectiveT>
+class ObjectiveTest : public ::testing::TestWithParam<ObjectiveTestParameters> {
+  typedef typename ObjectiveT::DataT DataT;
+  typedef typename ObjectiveT::LabelT LabelT;
+  typedef typename ObjectiveT::IdxT IdxT;
+  typedef typename ObjectiveT::BinT BinT;
+
+  ObjectiveTestParameters params;
+
+ public:
+  auto RandUnder(int const end = 10000) { return rand() % end; }
+
+  auto GenRandomData()
+  {
+    std::default_random_engine rng;
+    std::vector<DataT> data(params.n_rows);
+    if constexpr (std::is_same<BinT, CountBin>::value)  // classification case
+    {
+      for (auto& d : data) {
+        d = RandUnder(params.n_classes);
+      }
+    } else {
+      std::normal_distribution<DataT> normal(1.0, 2.0);
+      for (auto& d : data) {
+        auto rand_element{DataT(0)};
+        while (1) {
+          rand_element = normal(rng);
+          if (rand_element > 0) break;  // only positive random numbers
+        }
+        d = rand_element;
+      }
+    }
+    return data;
+  }
+
+  auto GenHist(std::vector<DataT> data)
+  {
+    std::vector<BinT> cdf_hist, pdf_hist;
+
+    for (auto c = 0; c < params.n_classes; ++c) {
+      for (auto b = 0; b < params.n_bins; ++b) {
+        IdxT bin_width  = raft::ceildiv(params.n_rows, params.n_bins);
+        auto data_begin = data.begin() + b * bin_width;
+        auto data_end   = data_begin + bin_width;
+        if constexpr (std::is_same<BinT, CountBin>::value) {  // classification case
+          auto count{IdxT(0)};
+          std::for_each(data_begin, data_end, [&](auto d) {
+            if (d == c) ++count;
+          });
+          pdf_hist.emplace_back(count);
+        } else {  // regression case
+          auto label_sum{DataT(0)};
+          label_sum = std::accumulate(data_begin, data_end, DataT(0));
+          pdf_hist.emplace_back(label_sum, bin_width);
+        }
+
+        auto cumulative = b > 0 ? cdf_hist.back() : BinT();
+        cdf_hist.emplace_back(pdf_hist.empty() ? BinT() : pdf_hist.back());
+        cdf_hist.back() += cumulative;
+      }
+    }
+
+    return std::make_pair(cdf_hist, pdf_hist);
+  }
+
+  auto MSE(std::vector<DataT> const& data)  //  1/n * 1/2 * sum((y - y_pred) * (y - y_pred))
+  {
+    DataT sum        = std::accumulate(data.begin(), data.end(), DataT(0));
+    DataT const mean = sum / data.size();
+    auto mse{DataT(0.0)};  // mse: mean squared error
+
+    std::for_each(data.begin(), data.end(), [&](auto d) {
+      mse += (d - mean) * (d - mean);  // unit deviance
+    });
+
+    mse /= 2 * data.size();
+    return std::make_tuple(mse, sum, DataT(data.size()));
+  }
+
+  auto MSEGroundTruthGain(std::vector<DataT> const& data, std::size_t split_bin_index)
+  {
+    auto bin_width = raft::ceildiv(params.n_rows, params.n_bins);
+    std::vector<DataT> left_data(data.begin(), data.begin() + (split_bin_index + 1) * bin_width);
+    std::vector<DataT> right_data(data.begin() + (split_bin_index + 1) * bin_width, data.end());
+
+    auto [parent_mse, label_sum, n]            = MSE(data);
+    auto [left_mse, label_sum_left, n_left]    = MSE(left_data);
+    auto [right_mse, label_sum_right, n_right] = MSE(right_data);
+
+    auto gain =
+      parent_mse - ((n_left / n) * left_mse +  // the minimizing objective function is half deviance
+                    (n_right / n) * right_mse);  // gain in long form without proxy
+
+    // edge cases
+    if (n_left < params.min_samples_leaf or n_right < params.min_samples_leaf)
+      return -std::numeric_limits<DataT>::max();
+    else
+      return gain;
+  }
+
+  auto InverseGaussianHalfDeviance(
+    std::vector<DataT> const&
+      data)  //  1/n * 2 * sum((y - y_pred) * (y - y_pred)/(y * (y_pred) * (y_pred)))
+  {
+    DataT sum        = std::accumulate(data.begin(), data.end(), DataT(0));
+    DataT const mean = sum / data.size();
+    auto ighd{DataT(0.0)};  // ighd: inverse gaussian half deviance
+
+    std::for_each(data.begin(), data.end(), [&](auto d) {
+      ighd += (d - mean) * (d - mean) / (d * mean * mean);  // unit deviance
+    });
+
+    ighd /= 2 * data.size();
+    return std::make_tuple(ighd, sum, DataT(data.size()));
+  }
+
+  auto InverseGaussianGroundTruthGain(std::vector<DataT> const& data, std::size_t split_bin_index)
+  {
+    auto bin_width = raft::ceildiv(params.n_rows, params.n_bins);
+    std::vector<DataT> left_data(data.begin(), data.begin() + (split_bin_index + 1) * bin_width);
+    std::vector<DataT> right_data(data.begin() + (split_bin_index + 1) * bin_width, data.end());
+
+    auto [parent_ighd, label_sum, n]            = InverseGaussianHalfDeviance(data);
+    auto [left_ighd, label_sum_left, n_left]    = InverseGaussianHalfDeviance(left_data);
+    auto [right_ighd, label_sum_right, n_right] = InverseGaussianHalfDeviance(right_data);
+
+    auto gain = parent_ighd -
+                ((n_left / n) * left_ighd +    // the minimizing objective function is half deviance
+                 (n_right / n) * right_ighd);  // gain in long form without proxy
+
+    // edge cases
+    if (n_left < params.min_samples_leaf or n_right < params.min_samples_leaf or
+        label_sum < ObjectiveT::eps_ or label_sum_right < ObjectiveT::eps_ or
+        label_sum_left < ObjectiveT::eps_)
+      return -std::numeric_limits<DataT>::max();
+    else
+      return gain;
+  }
+
+  auto GammaHalfDeviance(
+    std::vector<DataT> const& data)  //  1/n * 2 * sum(log(y_pred/y_true) + y_true/y_pred - 1)
+  {
+    DataT sum(0);
+    sum              = std::accumulate(data.begin(), data.end(), DataT(0));
+    DataT const mean = sum / data.size();
+    DataT ghd(0);  // gamma half deviance
+
+    std::for_each(data.begin(), data.end(), [&](auto& element) {
+      auto log_y = raft::myLog(element ? element : DataT(1.0));
+      ghd += raft::myLog(mean) - log_y + element / mean - 1;
+    });
+
+    ghd /= data.size();
+    return std::make_tuple(ghd, sum, DataT(data.size()));
+  }
+
+  auto GammaGroundTruthGain(std::vector<DataT> const& data, std::size_t split_bin_index)
+  {
+    auto bin_width = raft::ceildiv(params.n_rows, params.n_bins);
+    std::vector<DataT> left_data(data.begin(), data.begin() + (split_bin_index + 1) * bin_width);
+    std::vector<DataT> right_data(data.begin() + (split_bin_index + 1) * bin_width, data.end());
+
+    auto [parent_ghd, label_sum, n]            = GammaHalfDeviance(data);
+    auto [left_ghd, label_sum_left, n_left]    = GammaHalfDeviance(left_data);
+    auto [right_ghd, label_sum_right, n_right] = GammaHalfDeviance(right_data);
+
+    auto gain =
+      parent_ghd - ((n_left / n) * left_ghd +  // the minimizing objective function is half deviance
+                    (n_right / n) * right_ghd);  // gain in long form without proxy
+
+    // edge cases
+    if (n_left < params.min_samples_leaf or n_right < params.min_samples_leaf or
+        label_sum < ObjectiveT::eps_ or label_sum_right < ObjectiveT::eps_ or
+        label_sum_left < ObjectiveT::eps_)
+      return -std::numeric_limits<DataT>::max();
+    else
+      return gain;
+  }
+
+  auto PoissonHalfDeviance(
+    std::vector<DataT> const& data)  //  1/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true)
+  {
+    DataT sum       = std::accumulate(data.begin(), data.end(), DataT(0));
+    auto const mean = sum / data.size();
+    auto poisson_half_deviance{DataT(0.0)};
+
+    std::for_each(data.begin(), data.end(), [&](auto d) {
+      auto log_y = raft::myLog(d ? d : DataT(1.0));  // we don't want nans
+      poisson_half_deviance += d * (log_y - raft::myLog(mean)) + mean - d;
+    });
+
+    poisson_half_deviance /= data.size();
+    return std::make_tuple(poisson_half_deviance, sum, DataT(data.size()));
+  }
+
+  auto PoissonGroundTruthGain(std::vector<DataT> const& data, std::size_t split_bin_index)
+  {
+    auto bin_width = raft::ceildiv(params.n_rows, params.n_bins);
+    std::vector<DataT> left_data(data.begin(), data.begin() + (split_bin_index + 1) * bin_width);
+    std::vector<DataT> right_data(data.begin() + (split_bin_index + 1) * bin_width, data.end());
+
+    auto [parent_phd, label_sum, n]            = PoissonHalfDeviance(data);
+    auto [left_phd, label_sum_left, n_left]    = PoissonHalfDeviance(left_data);
+    auto [right_phd, label_sum_right, n_right] = PoissonHalfDeviance(right_data);
+
+    auto gain = parent_phd - ((n_left / n) * left_phd +
+                              (n_right / n) * right_phd);  // gain in long form without proxy
+
+    // edge cases
+    if (n_left < params.min_samples_leaf or n_right < params.min_samples_leaf or
+        label_sum < ObjectiveT::eps_ or label_sum_right < ObjectiveT::eps_ or
+        label_sum_left < ObjectiveT::eps_)
+      return -std::numeric_limits<DataT>::max();
+    else
+      return gain;
+  }
+
+  auto Entropy(std::vector<DataT> const& data)
+  {  // sum((n_c/n_total)*(log(n_c/n_total)))
+    DataT entropy(0);
+    for (auto c = 0; c < params.n_classes; ++c) {
+      IdxT sum(0);
+      std::for_each(data.begin(), data.end(), [&](auto d) {
+        if (d == DataT(c)) ++sum;
+      });
+      DataT class_proba = DataT(sum) / data.size();
+      entropy += -class_proba * raft::myLog(class_proba ? class_proba : DataT(1)) /
+                 raft::myLog(DataT(2));  // adding gain
+    }
+    return entropy;
+  }
+
+  auto EntropyGroundTruthGain(std::vector<DataT> const& data, std::size_t const split_bin_index)
+  {
+    auto bin_width = raft::ceildiv(params.n_rows, params.n_bins);
+    std::vector<DataT> left_data(data.begin(), data.begin() + (split_bin_index + 1) * bin_width);
+    std::vector<DataT> right_data(data.begin() + (split_bin_index + 1) * bin_width, data.end());
+
+    auto parent_entropy = Entropy(data);
+    auto left_entropy   = Entropy(left_data);
+    auto right_entropy  = Entropy(right_data);
+    DataT n             = data.size();
+    DataT left_n        = left_data.size();
+    DataT right_n       = right_data.size();
+
+    auto gain = parent_entropy - ((left_n / n) * left_entropy + (right_n / n) * right_entropy);
+
+    // edge cases
+    if (left_n < params.min_samples_leaf or right_n < params.min_samples_leaf) {
+      return -std::numeric_limits<DataT>::max();
+    } else {
+      return gain;
+    }
+  }
+
+  auto GiniImpurity(std::vector<DataT> const& data)
+  {  // sum((n_c/n_total)(1-(n_c/n_total)))
+    DataT gini(0);
+    for (auto c = 0; c < params.n_classes; ++c) {
+      IdxT sum(0);
+      std::for_each(data.begin(), data.end(), [&](auto d) {
+        if (d == DataT(c)) ++sum;
+      });
+      DataT class_proba = DataT(sum) / data.size();
+      gini += class_proba * (1 - class_proba);  // adding gain
+    }
+    return gini;
+  }
+
+  auto GiniGroundTruthGain(std::vector<DataT> const& data, std::size_t const split_bin_index)
+  {
+    auto bin_width = raft::ceildiv(params.n_rows, params.n_bins);
+    std::vector<DataT> left_data(data.begin(), data.begin() + (split_bin_index + 1) * bin_width);
+    std::vector<DataT> right_data(data.begin() + (split_bin_index + 1) * bin_width, data.end());
+
+    auto parent_gini = GiniImpurity(data);
+    auto left_gini   = GiniImpurity(left_data);
+    auto right_gini  = GiniImpurity(right_data);
+    DataT n          = data.size();
+    DataT left_n     = left_data.size();
+    DataT right_n    = right_data.size();
+
+    auto gain = parent_gini - ((left_n / n) * left_gini + (right_n / n) * right_gini);
+
+    // edge cases
+    if (left_n < params.min_samples_leaf or right_n < params.min_samples_leaf) {
+      return -std::numeric_limits<DataT>::max();
+    } else {
+      return gain;
+    }
+  }
+
+  auto GroundTruthGain(std::vector<DataT> const& data, std::size_t const split_bin_index)
+  {
+    if constexpr (std::is_same<ObjectiveT, MSEObjectiveFunction<DataT, LabelT, IdxT>>::
+                    value)  // mean squared error
+    {
+      return MSEGroundTruthGain(data, split_bin_index);
+    } else if constexpr (std::is_same<ObjectiveT, PoissonObjectiveFunction<DataT, LabelT, IdxT>>::
+                           value)  // poisson
+    {
+      return PoissonGroundTruthGain(data, split_bin_index);
+    } else if constexpr (std::is_same<ObjectiveT,
+                                      GammaObjectiveFunction<DataT, LabelT, IdxT>>::value)  // gamma
+    {
+      return GammaGroundTruthGain(data, split_bin_index);
+    } else if constexpr (std::is_same<ObjectiveT,
+                                      InverseGaussianObjectiveFunction<DataT, LabelT, IdxT>>::
+                           value)  // inverse gaussian
+    {
+      return InverseGaussianGroundTruthGain(data, split_bin_index);
+    } else if constexpr (std::is_same<ObjectiveT, EntropyObjectiveFunction<DataT, LabelT, IdxT>>::
+                           value)  // entropy
+    {
+      return EntropyGroundTruthGain(data, split_bin_index);
+    } else if constexpr (std::is_same<ObjectiveT,
+                                      GiniObjectiveFunction<DataT, LabelT, IdxT>>::value)  // gini
+    {
+      return GiniGroundTruthGain(data, split_bin_index);
+    }
+    return DataT(0.0);
+  }
+
+  auto NumLeftOfBin(std::vector<BinT> const& cdf_hist, IdxT idx)
+  {
+    auto count{IdxT(0)};
+    for (auto c = 0; c < params.n_classes; ++c) {
+      if constexpr (std::is_same<BinT, CountBin>::value)  // countbin
+      {
+        count += cdf_hist[params.n_bins * c + idx].x;
+      } else  // aggregatebin
+      {
+        count += cdf_hist[params.n_bins * c + idx].count;
+      }
+    }
+    return count;
+  }
+
+  void SetUp() override
+  {
+    srand(params.seed);
+    params = ::testing::TestWithParam<ObjectiveTestParameters>::GetParam();
+    ObjectiveT objective(params.n_classes, params.min_samples_leaf);
+
+    auto data                 = GenRandomData();
+    auto [cdf_hist, pdf_hist] = GenHist(data);
+    auto split_bin_index      = RandUnder(params.n_bins);
+    auto ground_truth_gain    = GroundTruthGain(data, split_bin_index);
+
+    auto hypothesis_gain = objective.GainPerSplit(&cdf_hist[0],
+                                                  split_bin_index,
+                                                  params.n_bins,
+                                                  NumLeftOfBin(cdf_hist, params.n_bins - 1),
+                                                  NumLeftOfBin(cdf_hist, split_bin_index));
+
+    ASSERT_NEAR(ground_truth_gain, hypothesis_gain, params.tolerance);
+  }
+};
+
+const std::vector<ObjectiveTestParameters> mse_objective_test_parameters = {
+  {9507819643927052255LLU, 2048, 64, 1, 0, 0.00001},
+  {9507819643927052259LLU, 2048, 128, 1, 1, 0.00001},
+  {9507819643927052251LLU, 2048, 256, 1, 1, 0.00001},
+  {9507819643927052258LLU, 2048, 512, 1, 5, 0.00001},
+};
+
+const std::vector<ObjectiveTestParameters> poisson_objective_test_parameters = {
+  {9507819643927052255LLU, 2048, 64, 1, 0, 0.00001},
+  {9507819643927052259LLU, 2048, 128, 1, 1, 0.00001},
+  {9507819643927052251LLU, 2048, 256, 1, 1, 0.00001},
+  {9507819643927052258LLU, 2048, 512, 1, 5, 0.00001},
+};
+
+const std::vector<ObjectiveTestParameters> gamma_objective_test_parameters = {
+  {9507819643927052255LLU, 2048, 64, 1, 0, 0.00001},
+  {9507819643927052259LLU, 2048, 128, 1, 1, 0.00001},
+  {9507819643927052251LLU, 2048, 256, 1, 1, 0.00001},
+  {9507819643927052258LLU, 2048, 512, 1, 5, 0.00001},
+};
+
+const std::vector<ObjectiveTestParameters> invgauss_objective_test_parameters = {
+  {9507819643927052255LLU, 2048, 64, 1, 0, 0.00001},
+  {9507819643927052259LLU, 2048, 128, 1, 1, 0.00001},
+  {9507819643927052251LLU, 2048, 256, 1, 1, 0.00001},
+  {9507819643927052258LLU, 2048, 512, 1, 5, 0.00001},
+};
+
+const std::vector<ObjectiveTestParameters> entropy_objective_test_parameters = {
+  {9507819643927052255LLU, 2048, 64, 2, 0, 0.00001},
+  {9507819643927052256LLU, 2048, 128, 10, 1, 0.00001},
+  {9507819643927052257LLU, 2048, 256, 100, 1, 0.00001},
+  {9507819643927052258LLU, 2048, 512, 100, 5, 0.00001},
+};
+
+const std::vector<ObjectiveTestParameters> gini_objective_test_parameters = {
+  {9507819643927052255LLU, 2048, 64, 2, 0, 0.00001},
+  {9507819643927052256LLU, 2048, 128, 10, 1, 0.00001},
+  {9507819643927052257LLU, 2048, 256, 100, 1, 0.00001},
+  {9507819643927052258LLU, 2048, 512, 100, 5, 0.00001},
+};
+
+// mse objective test
+typedef ObjectiveTest<MSEObjectiveFunction<double, double, int>> MSEObjectiveTestD;
+TEST_P(MSEObjectiveTestD, MSEObjectiveTest) {}
+INSTANTIATE_TEST_CASE_P(RfTests,
+                        MSEObjectiveTestD,
+                        ::testing::ValuesIn(mse_objective_test_parameters));
+typedef ObjectiveTest<MSEObjectiveFunction<float, float, int>> MSEObjectiveTestF;
+TEST_P(MSEObjectiveTestF, MSEObjectiveTest) {}
+INSTANTIATE_TEST_CASE_P(RfTests,
+                        MSEObjectiveTestF,
+                        ::testing::ValuesIn(mse_objective_test_parameters));
+
+// poisson objective test
+typedef ObjectiveTest<PoissonObjectiveFunction<double, double, int>> PoissonObjectiveTestD;
+TEST_P(PoissonObjectiveTestD, poissonObjectiveTest) {}
+INSTANTIATE_TEST_CASE_P(RfTests,
+                        PoissonObjectiveTestD,
+                        ::testing::ValuesIn(poisson_objective_test_parameters));
+typedef ObjectiveTest<PoissonObjectiveFunction<float, float, int>> PoissonObjectiveTestF;
+TEST_P(PoissonObjectiveTestF, poissonObjectiveTest) {}
+INSTANTIATE_TEST_CASE_P(RfTests,
+                        PoissonObjectiveTestF,
+                        ::testing::ValuesIn(poisson_objective_test_parameters));
+
+// gamma objective test
+typedef ObjectiveTest<GammaObjectiveFunction<double, double, int>> GammaObjectiveTestD;
+TEST_P(GammaObjectiveTestD, GammaObjectiveTest) {}
+INSTANTIATE_TEST_CASE_P(RfTests,
+                        GammaObjectiveTestD,
+                        ::testing::ValuesIn(gamma_objective_test_parameters));
+typedef ObjectiveTest<GammaObjectiveFunction<float, float, int>> GammaObjectiveTestF;
+TEST_P(GammaObjectiveTestF, GammaObjectiveTest) {}
+INSTANTIATE_TEST_CASE_P(RfTests,
+                        GammaObjectiveTestF,
+                        ::testing::ValuesIn(gamma_objective_test_parameters));
+
+// InvGauss objective test
+typedef ObjectiveTest<InverseGaussianObjectiveFunction<double, double, int>>
+  InverseGaussianObjectiveTestD;
+TEST_P(InverseGaussianObjectiveTestD, InverseGaussianObjectiveTest) {}
+INSTANTIATE_TEST_CASE_P(RfTests,
+                        InverseGaussianObjectiveTestD,
+                        ::testing::ValuesIn(invgauss_objective_test_parameters));
+typedef ObjectiveTest<InverseGaussianObjectiveFunction<float, float, int>>
+  InverseGaussianObjectiveTestF;
+TEST_P(InverseGaussianObjectiveTestF, InverseGaussianObjectiveTest) {}
+INSTANTIATE_TEST_CASE_P(RfTests,
+                        InverseGaussianObjectiveTestF,
+                        ::testing::ValuesIn(invgauss_objective_test_parameters));
+
+// entropy objective test
+typedef ObjectiveTest<EntropyObjectiveFunction<double, int, int>> EntropyObjectiveTestD;
+TEST_P(EntropyObjectiveTestD, entropyObjectiveTest) {}
+INSTANTIATE_TEST_CASE_P(RfTests,
+                        EntropyObjectiveTestD,
+                        ::testing::ValuesIn(entropy_objective_test_parameters));
+typedef ObjectiveTest<EntropyObjectiveFunction<float, int, int>> EntropyObjectiveTestF;
+TEST_P(EntropyObjectiveTestF, entropyObjectiveTest) {}
+INSTANTIATE_TEST_CASE_P(RfTests,
+                        EntropyObjectiveTestF,
+                        ::testing::ValuesIn(entropy_objective_test_parameters));
+
+// gini objective test
+typedef ObjectiveTest<GiniObjectiveFunction<double, int, int>> GiniObjectiveTestD;
+TEST_P(GiniObjectiveTestD, giniObjectiveTest) {}
+INSTANTIATE_TEST_CASE_P(RfTests,
+                        GiniObjectiveTestD,
+                        ::testing::ValuesIn(gini_objective_test_parameters));
+typedef ObjectiveTest<GiniObjectiveFunction<float, int, int>> GiniObjectiveTestF;
+TEST_P(GiniObjectiveTestF, giniObjectiveTest) {}
+INSTANTIATE_TEST_CASE_P(RfTests,
+                        GiniObjectiveTestF,
+                        ::testing::ValuesIn(gini_objective_test_parameters));
+
+}  // end namespace DT
 }  // end namespace ML
diff --git a/cpp/test/sg/rf_treelite_test.cu b/cpp/test/sg/rf_treelite_test.cu
deleted file mode 100644
index 89e75eea2d..0000000000
--- a/cpp/test/sg/rf_treelite_test.cu
+++ /dev/null
@@ -1,570 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <test_utils.h>
-
-#include <decisiontree/treelite_util.h>
-#include <decisiontree/decisiontree.cuh>
-
-#include <cuml/ensemble/randomforest.hpp>
-
-#include <raft/cudart_utils.h>
-#include <raft/linalg/gemv.h>
-#include <raft/linalg/transpose.h>
-#include <raft/cuda_utils.cuh>
-#include <raft/random/rng.cuh>
-
-#include <treelite/c_api.h>
-#include <treelite/c_api_runtime.h>
-
-#include <gtest/gtest.h>
-
-#include <sys/stat.h>
-#include <cstdlib>
-#include <fstream>
-#include <iostream>
-#include <limits>
-#include <string>
-
-namespace ML {
-
-using namespace MLCommon;
-
-template <typename T>  // template useless for now.
-struct RfInputs {
-  int n_rows;
-  int n_cols;
-  int n_trees;
-  float max_features;
-  float max_samples;
-  int n_inference_rows;
-  int max_depth;
-  int max_leaves;
-  bool bootstrap;
-  int n_bins;
-  int min_samples_leaf;
-  int min_samples_split;
-  float min_impurity_decrease;
-  int n_streams;
-  CRITERION split_criterion;
-};
-
-template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const RfInputs<T>& dims)
-{
-  return os;
-}
-
-template <typename T, typename L>
-class RfTreeliteTestCommon : public ::testing::TestWithParam<RfInputs<T>> {
- protected:
-  void ConcatenateTreeliteModels()
-  {
-    // Test the implementation for converting fitted forest into treelite format.
-    ModelHandle concatenated_forest_handle;
-    concatenated_forest_handle = concatenate_trees(treelite_indiv_handles);
-    compare_concat_forest_to_subforests(concatenated_forest_handle, treelite_indiv_handles);
-    std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name();
-    // Get the test index from Google current_test_info.
-    // The test index is the string after '/' in test_name.
-    std::string index_str = test_name.substr(test_name.find("/") + 1, test_name.length());
-
-    // Create a directory if the test is the first one in the test case.
-    int mkdir_ret = mkdir(test_dir.c_str(), 0700);
-    if (mkdir_ret != 0) {
-      // Ignore the error if the error is caused by EEXIST.
-      // Treelite will generate errors when the directory is not accessible.
-      ASSERT(errno == EEXIST, "Call mkdir %s fails.", test_dir.c_str());
-    }
-
-    // Create a sub-directory for the test case.
-    dir_name = test_dir + index_str;
-
-    CompilerHandle compiler;
-    // "ast_navive" is the default compiler treelite used in their Python code.
-    TREELITE_CHECK(TreeliteCompilerCreateV2("ast_native", "{}", &compiler));
-
-    // Generate C code in the directory specified below.
-    // The parallel comilplation is disabled. To enable it, one needs to specify parallel_comp of
-    // CompilerHandle. Treelite will create a directory if it doesn't exist.
-    TREELITE_CHECK(
-      TreeliteCompilerGenerateCodeV2(compiler, treelite_indiv_handles[0], dir_name.c_str()));
-    TREELITE_CHECK(TreeliteCompilerFree(compiler));
-
-    // Options copied from
-    // https://github.com/dmlc/treelite/blob/528d883f8f39eb5dd633e929b95915b63e210b39/python/treelite/contrib/__init__.py.
-    std::string obj_cmd = "gcc -c -O3 -o " + dir_name + "/main.o " + dir_name +
-                          "/main.c -fPIC "
-                          "-std=c99 -lm";
-
-    std::string lib_cmd =
-      "gcc -shared -O3 -o " + dir_name + "/treelite_model.so " + dir_name + "/main.o -std=c99 -lm";
-
-    ASSERT(system(obj_cmd.c_str()) == 0, "Call %s fails.", obj_cmd.c_str());
-    ASSERT(system(lib_cmd.c_str()) == 0, "Call %s fails.", lib_cmd.c_str());
-
-    PredictorHandle predictor;
-    std::string lib_path = dir_name + "/treelite_model.so";
-
-    // -1 means use maximum possible worker threads.
-    int worker_thread = -1;
-    TREELITE_CHECK(TreelitePredictorLoad(lib_path.c_str(), worker_thread, &predictor));
-
-    DMatrixHandle dmat;
-    // Current RF doesn't seem to support missing value, put NaN to be safe.
-    T missing_value = std::numeric_limits<T>::quiet_NaN();
-    TREELITE_CHECK(TreeliteDMatrixCreateFromMat(inference_data_h.data(),
-                                                ML::DT::TreeliteType<T>::value,
-                                                params.n_inference_rows,
-                                                params.n_cols,
-                                                &missing_value,
-                                                &dmat));
-
-    // Use dense batch so batch_sparse is 0.
-    // pred_margin = true means to produce raw margins rather than transformed probability.
-    bool pred_margin = false;
-    // Allocate larger array for treelite predicted label with using multi-class classification to
-    // avoid seg faults. Altough later we only use first params.n_inference_rows elements.
-    size_t treelite_predicted_labels_size;
-
-    int verbose = 0;
-    TREELITE_CHECK(TreelitePredictorPredictBatch(predictor,
-                                                 dmat,
-                                                 verbose,
-                                                 pred_margin,
-                                                 treelite_predicted_labels.data(),
-                                                 &treelite_predicted_labels_size));
-
-    TREELITE_CHECK(TreeliteDMatrixFree(dmat));
-    TREELITE_CHECK(TreelitePredictorFree(predictor));
-    TREELITE_CHECK(TreeliteFreeModel(concatenated_forest_handle));
-    TREELITE_CHECK(TreeliteFreeModel(treelite_indiv_handles[0]));
-    TREELITE_CHECK(TreeliteFreeModel(treelite_indiv_handles[1]));
-    TREELITE_CHECK(TreeliteFreeModel(treelite_indiv_handles[2]));
-  }
-
-  void getResultAndCheck()
-  {
-    // Predict and compare against known labels
-    predict(*handle,
-            forest,
-            inference_data_d,
-            params.n_inference_rows,
-            params.n_cols,
-            predicted_labels_d);
-    score(*handle, forest, labels_d, params.n_inference_rows, predicted_labels_d);
-
-    CUDA_CHECK(cudaStreamSynchronize(stream));
-
-    predicted_labels_h.resize(params.n_inference_rows);
-    ref_predicted_labels.resize(params.n_inference_rows);
-
-    raft::update_host(
-      predicted_labels_h.data(), predicted_labels_d, params.n_inference_rows, stream);
-    CUDA_CHECK(cudaStreamSynchronize(stream));
-
-    for (int i = 0; i < params.n_inference_rows; i++) {
-      if (is_classification) {
-        ref_predicted_labels[i]      = static_cast<float>(predicted_labels_h[i]);
-        treelite_predicted_labels[i] = treelite_predicted_labels[i] >= 0.5 ? 1 : 0;
-      } else {
-        ref_predicted_labels[i] = static_cast<float>(predicted_labels_h[i]);
-      }
-    }
-
-    EXPECT_TRUE(raft::devArrMatchHost(ref_predicted_labels.data(),
-                                      treelite_predicted_labels.data(),
-                                      params.n_inference_rows,
-                                      raft::Compare<float>(),
-                                      stream));
-  }
-
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<RfInputs<T>>::GetParam();
-
-    rf_params = set_rf_params(params.max_depth,
-                              params.max_leaves,
-                              params.max_features,
-                              params.n_bins,
-                              params.min_samples_leaf,
-                              params.min_samples_split,
-                              params.min_impurity_decrease,
-                              params.bootstrap,
-                              params.n_trees,
-                              params.max_samples,
-                              0,
-                              params.split_criterion,
-                              params.n_streams,
-                              128);
-
-    handle.reset(new raft::handle_t(rf_params.n_streams));
-
-    data_len           = params.n_rows * params.n_cols;
-    inference_data_len = params.n_inference_rows * params.n_cols;
-
-    raft::allocate(data_d, data_len, stream);
-    raft::allocate(inference_data_d, inference_data_len, stream);
-
-    raft::allocate(labels_d, params.n_rows, stream);
-    raft::allocate(predicted_labels_d, params.n_inference_rows, stream);
-
-    treelite_predicted_labels.resize(params.n_inference_rows);
-    ref_predicted_labels.resize(params.n_inference_rows);
-
-    CUDA_CHECK(cudaStreamCreate(&stream));
-    handle->set_stream(stream);
-
-    forest          = new typename ML::RandomForestMetaData<T, L>;
-    forest_2        = new typename ML::RandomForestMetaData<T, L>;
-    forest_3        = new typename ML::RandomForestMetaData<T, L>;
-    all_forest_info = {forest, forest_2, forest_3};
-    data_h.resize(data_len);
-    inference_data_h.resize(inference_data_len);
-
-    // Random number generator.
-    raft::random::Rng r1(1234ULL);
-    // Generate data_d is in column major order.
-    r1.uniform(data_d, data_len, T(0.0), T(10.0), stream);
-    raft::random::Rng r2(4321ULL);
-    // Generate inference_data_d which is in row major order.
-    r2.uniform(inference_data_d, inference_data_len, T(0.0), T(10.0), stream);
-
-    raft::update_host(data_h.data(), data_d, data_len, stream);
-    raft::update_host(inference_data_h.data(), inference_data_d, inference_data_len, stream);
-    CUDA_CHECK(cudaStreamSynchronize(stream));
-  }
-
-  void TearDown() override
-  {
-    CUDA_CHECK(cudaStreamDestroy(stream));
-
-    CUDA_CHECK(cudaFree(data_d));
-    CUDA_CHECK(cudaFree(inference_data_d));
-    CUDA_CHECK(cudaFree(labels_d));
-    CUDA_CHECK(cudaFree(predicted_labels_d));
-
-    delete forest;
-    delete forest_2;
-    delete forest_3;
-    all_forest_info.clear();
-    labels_h.clear();
-    predicted_labels_h.clear();
-    data_h.clear();
-    inference_data_h.clear();
-    treelite_predicted_labels.clear();
-    ref_predicted_labels.clear();
-    treelite_indiv_handles.clear();
-  }
-
- protected:
-  RfInputs<T> params;
-  RF_params rf_params;
-  T *data_d, *inference_data_d;
-  std::vector<T> data_h;
-  std::vector<T> inference_data_h;
-  std::vector<ModelHandle> treelite_indiv_handles;
-
-  // Set to 1 for regression and 2 for binary classification
-  // #class for multi-classification
-  int task_category;
-  int is_classification;
-
-  int data_len;
-  int inference_data_len;
-
-  cudaStream_t stream = 0;
-  std::shared_ptr<raft::handle_t> handle;
-  std::vector<float> treelite_predicted_labels;
-  std::vector<float> ref_predicted_labels;
-  std::vector<ML::RandomForestMetaData<T, L>*> all_forest_info;
-  std::string test_dir;
-  std::string dir_name;
-
-  L *labels_d, *predicted_labels_d;
-  std::vector<L> labels_h;
-  std::vector<L> predicted_labels_h;
-
-  RandomForestMetaData<T, L>* forest;
-  RandomForestMetaData<T, L>* forest_2;
-  RandomForestMetaData<T, L>* forest_3;
-};  // namespace ML
-
-template <typename T, typename L>
-class RfConcatTestClf : public RfTreeliteTestCommon<T, L> {
- protected:
-  void testClassifier()
-  {
-    this->test_dir          = "./concat_test_clf/";
-    this->is_classification = 1;
-    // task_category - 1 for regression, 2 for binary classification
-    // #class for multi-class classification
-    this->task_category = 2;
-
-    std::vector<float> temp_label_h;
-
-    cudaStream_t stream = 0;
-    CUDA_CHECK(cudaStreamCreate(&stream));
-
-    rmm::device_uvector<float> weight(this->params.n_cols, stream);
-    rmm::device_uvector<float> temp_label_d(this->params.n_rows, stream);
-    rmm::device_uvector<float> temp_data_d(this->data_len, stream);
-
-    raft::random::Rng r(1234ULL);
-
-    // Generate weight for each feature.
-    r.uniform(weight.data(), this->params.n_cols, T(0.0), T(1.0), this->stream);
-    // Generate noise.
-    r.uniform(temp_label_d.data(), this->params.n_rows, T(0.0), T(10.0), this->stream);
-
-    raft::linalg::transpose<float>(*(this->handle),
-                                   this->data_d,
-                                   temp_data_d.data(),
-                                   this->params.n_rows,
-                                   this->params.n_cols,
-                                   this->stream);
-
-    raft::linalg::gemv<float>(*(this->handle),
-                              temp_data_d.data(),
-                              this->params.n_cols,
-                              this->params.n_rows,
-                              weight.data(),
-                              temp_label_d.data(),
-                              true,
-                              1.f,
-                              1.f,
-                              this->stream);
-
-    temp_label_h.resize(this->params.n_rows);
-    raft::update_host(temp_label_h.data(), temp_label_d.data(), this->params.n_rows, this->stream);
-
-    CUDA_CHECK(cudaStreamSynchronize(this->stream));
-
-    int value;
-    for (int i = 0; i < this->params.n_rows; i++) {
-      // The value of temp_label is between 0 to 10*n_cols+noise_level(10).
-      // Choose half of that as the theshold to balance two classes.
-      if (temp_label_h[i] >= (10 * this->params.n_cols + 10) / 2.0) {
-        value = 1;
-      } else {
-        value = 0;
-      }
-      this->labels_h.push_back(value);
-    }
-
-    raft::update_device(this->labels_d, this->labels_h.data(), this->params.n_rows, this->stream);
-
-    preprocess_labels(this->params.n_rows, this->labels_h, labels_map);
-
-    for (int i = 0; i < 3; i++) {
-      ModelHandle model;
-
-      this->rf_params.n_trees = this->rf_params.n_trees + i;
-
-      fit(*(this->handle),
-          this->all_forest_info[i],
-          this->data_d,
-          this->params.n_rows,
-          this->params.n_cols,
-          this->labels_d,
-          labels_map.size(),
-          this->rf_params);
-      build_treelite_forest(
-        &model, this->all_forest_info[i], this->params.n_cols, this->task_category);
-      this->treelite_indiv_handles.push_back(model);
-    }
-
-    CUDA_CHECK(cudaStreamSynchronize(this->stream));
-
-    this->ConcatenateTreeliteModels();
-    this->getResultAndCheck();
-
-    postprocess_labels(this->params.n_rows, this->labels_h, this->labels_map);
-
-    labels_map.clear();
-    temp_label_h.clear();
-  }
-
- protected:
-  std::map<int, int> labels_map;  // unique map of labels to int vals starting from 0
-};
-
-//-------------------------------------------------------------------------------------------------------------------------------------
-template <typename T, typename L>
-class RfConcatTestReg : public RfTreeliteTestCommon<T, L> {
- protected:
-  void testRegressor()
-  {
-    this->test_dir          = "./concat_test_reg/";
-    this->is_classification = 0;
-    // task_category - 1 for regression, 2 for binary classification
-    // #class for multi-class classification
-    this->task_category = 1;
-
-    cudaStream_t stream = 0;
-    CUDA_CHECK(cudaStreamCreate(&stream));
-
-    rmm::device_uvector<float> weight(this->params.n_cols, stream);
-    rmm::device_uvector<float> temp_data_d(this->data_len, stream);
-
-    raft::random::Rng r(1234ULL);
-
-    // Generate weight for each feature.
-    r.uniform(weight.data(), this->params.n_cols, T(0.0), T(1.0), this->stream);
-    // Generate noise.
-    r.uniform(this->labels_d, this->params.n_rows, T(0.0), T(10.0), this->stream);
-
-    raft::linalg::transpose<float>(*(this->handle),
-                                   this->data_d,
-                                   temp_data_d.data(),
-                                   this->params.n_rows,
-                                   this->params.n_cols,
-                                   this->stream);
-
-    raft::linalg::gemv<float>(*(this->handle),
-                              temp_data_d.data(),
-                              this->params.n_cols,
-                              this->params.n_rows,
-                              weight.data(),
-                              this->labels_d,
-                              true,
-                              1.f,
-                              1.f,
-                              this->stream);
-
-    this->labels_h.resize(this->params.n_rows);
-    raft::update_host(this->labels_h.data(), this->labels_d, this->params.n_rows, this->stream);
-    CUDA_CHECK(cudaStreamSynchronize(this->stream));
-
-    for (int i = 0; i < 3; i++) {
-      ModelHandle model;
-
-      this->rf_params.n_trees = this->rf_params.n_trees + i;
-
-      fit(*(this->handle),
-          this->all_forest_info[i],
-          this->data_d,
-          this->params.n_rows,
-          this->params.n_cols,
-          this->labels_d,
-          this->rf_params);
-      build_treelite_forest(
-        &model, this->all_forest_info[i], this->params.n_cols, this->task_category);
-      CUDA_CHECK(cudaStreamSynchronize(this->stream));
-      this->treelite_indiv_handles.push_back(model);
-    }
-
-    this->ConcatenateTreeliteModels();
-    this->getResultAndCheck();
-  }
-};
-
-// //-------------------------------------------------------------------------------------------------------------------------------------
-const std::vector<RfInputs<float>> inputsf2_clf = {
-  {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, 4, 2, 2, 0.0, 2, CRITERION::GINI},  // single tree forest,
-                                                                             // bootstrap false,
-                                                                             // depth 8, 4 bins
-  {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, 4, 2, 2, 0.0, 2, CRITERION::GINI},  // single tree forest,
-                                                                             // bootstrap false,
-                                                                             // depth of 8, 4 bins
-  {4,
-   2,
-   10,
-   1.0f,
-   1.0f,
-   4,
-   8,
-   -1,
-   false,
-   4,
-   2,
-   2,
-   0.0,
-   2,
-   CRITERION::GINI},  // forest with 10 trees, all trees should produce identical predictions (no
-                      // bootstrapping or column subsampling)
-  {4,
-   2,
-   10,
-   0.8f,
-   0.8f,
-   4,
-   8,
-   -1,
-   true,
-   3,
-   2,
-   2,
-   0.0,
-   2,
-   CRITERION::GINI},  // forest with 10 trees, with bootstrap and column subsampling enabled, 3 bins
-  {4,
-   2,
-   10,
-   0.8f,
-   0.8f,
-   4,
-   8,
-   -1,
-   true,
-   3,
-   2,
-   2,
-   0.0,
-   2,
-   CRITERION::CRITERION_END},  // forest with 10 trees, with bootstrap and column subsampling
-                               // enabled, 3 bins, different split algorithm
-  {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, 4, 2, 2, 0.0, 2, CRITERION::ENTROPY},
-  {4, 2, 1, 1.0f, 1.0f, 4, 8, -1, false, 4, 2, 2, 0.0, 2, CRITERION::ENTROPY},
-  {4, 2, 10, 1.0f, 1.0f, 4, 8, -1, false, 4, 2, 2, 0.0, 2, CRITERION::ENTROPY},
-  {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, 3, 2, 2, 0.0, 2, CRITERION::ENTROPY},
-  {4, 2, 10, 0.8f, 0.8f, 4, 8, -1, true, 3, 2, 2, 0.0, 2, CRITERION::ENTROPY}};
-
-typedef RfConcatTestClf<float, int> RfClassifierConcatTestF;
-TEST_P(RfClassifierConcatTestF, Convert_Clf) { testClassifier(); }
-
-INSTANTIATE_TEST_CASE_P(RfBinaryClassifierConcatTests,
-                        RfClassifierConcatTestF,
-                        ::testing::ValuesIn(inputsf2_clf));
-
-const std::vector<RfInputs<float>> inputsf2_reg = {
-  {4, 2, 1, 1.0f, 1.0f, 4, 7, -1, false, 4, 2, 2, 0.0, 2, CRITERION::MSE},
-  {4, 2, 1, 1.0f, 1.0f, 4, 7, -1, false, 4, 2, 2, 0.0, 2, CRITERION::MSE},
-  {4,
-   2,
-   5,
-   1.0f,
-   1.0f,
-   4,
-   7,
-   -1,
-   false,
-   4,
-   2,
-   2,
-   0.0,
-   2,
-   CRITERION::CRITERION_END},  // CRITERION_END uses the default criterion (GINI for classification,
-                               // MSE for regression)
-  {4, 2, 5, 1.0f, 1.0f, 4, 7, -1, true, 4, 2, 2, 0.0, 2, CRITERION::CRITERION_END}};
-
-typedef RfConcatTestReg<float, float> RfRegressorConcatTestF;
-TEST_P(RfRegressorConcatTestF, Convert_Reg) { testRegressor(); }
-
-INSTANTIATE_TEST_CASE_P(RfRegressorConcatTests,
-                        RfRegressorConcatTestF,
-                        ::testing::ValuesIn(inputsf2_reg));
-}  // end namespace ML
diff --git a/cpp/test/sg/rproj_test.cu b/cpp/test/sg/rproj_test.cu
index fc77722668..dc6ad9518b 100644
--- a/cpp/test/sg/rproj_test.cu
+++ b/cpp/test/sg/rproj_test.cu
@@ -22,7 +22,7 @@
 #include <cuml/metrics/metrics.hpp>
 #include <iostream>
 #include <raft/cuda_utils.cuh>
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <random>
 #include <vector>
 
diff --git a/cpp/test/sg/umap_parametrizable_test.cu b/cpp/test/sg/umap_parametrizable_test.cu
index 101c60193b..7eafe6a897 100644
--- a/cpp/test/sg/umap_parametrizable_test.cu
+++ b/cpp/test/sg/umap_parametrizable_test.cu
@@ -33,7 +33,7 @@
 
 #include <raft/cudart_utils.h>
 #include <raft/cuda_utils.cuh>
-#include <raft/distance/distance.cuh>
+#include <raft/distance/distance.hpp>
 #include <raft/handle.hpp>
 #include <raft/mr/device/allocator.hpp>
 #include <selection/knn.cuh>
@@ -109,6 +109,7 @@ class UMAPParametrizableTest : public ::testing::Test {
     bool fit_transform;
     bool supervised;
     bool knn_params;
+    bool refine;
     int n_samples;
     int n_features;
     int n_clusters;
@@ -183,6 +184,20 @@ class UMAPParametrizableTest : public ::testing::Test {
                     &umap_params,
                     model_embedding);
     }
+
+    if (test_params.refine) {
+      std::cout << "using refine";
+      if (test_params.supervised) {
+        auto cgraph_coo = ML::UMAP::get_graph(handle, X, y, n_samples, n_features, &umap_params);
+        ML::UMAP::refine(
+          handle, X, n_samples, n_features, cgraph_coo.get(), &umap_params, model_embedding);
+      } else {
+        auto cgraph_coo =
+          ML::UMAP::get_graph(handle, X, nullptr, n_samples, n_features, &umap_params);
+        ML::UMAP::refine(
+          handle, X, n_samples, n_features, cgraph_coo.get(), &umap_params, model_embedding);
+      }
+    }
     CUDA_CHECK(cudaStreamSynchronize(stream));
 
     if (!test_params.fit_transform) {
@@ -249,10 +264,10 @@ class UMAPParametrizableTest : public ::testing::Test {
               << umap_params.random_state << std::endl;
 
     std::cout << "test_params : [" << std::boolalpha << test_params.fit_transform << "-"
-              << test_params.supervised << "-" << test_params.knn_params << "-"
-              << test_params.n_samples << "-" << test_params.n_features << "-"
-              << test_params.n_clusters << "-" << test_params.min_trustworthiness << "]"
-              << std::endl;
+              << test_params.supervised << "-" << test_params.refine << "-"
+              << test_params.knn_params << "-" << test_params.n_samples << "-"
+              << test_params.n_features << "-" << test_params.n_clusters << "-"
+              << test_params.min_trustworthiness << "]" << std::endl;
 
     raft::handle_t handle;
     cudaStream_t stream = handle.get_stream();
@@ -289,6 +304,13 @@ class UMAPParametrizableTest : public ::testing::Test {
 
     float* e1 = embeddings1.data();
 
+#if CUDART_VERSION >= 11020
+    // Always use random init w/ CUDA 11.2. For some reason the
+    // spectral solver doesn't always converge w/ this CUDA version.
+    umap_params.init         = 0;
+    umap_params.random_state = 43;
+    umap_params.n_epochs     = 500;
+#endif
     get_embedding(handle, X_d.data(), (float*)y_d.data(), e1, test_params, umap_params);
 
     assertions(handle, X_d.data(), e1, test_params, umap_params);
@@ -323,14 +345,14 @@ class UMAPParametrizableTest : public ::testing::Test {
 
   void SetUp() override
   {
-    std::vector<TestParams> test_params_vec = {{false, false, false, 2000, 50, 20, 0.45},
-                                               {true, false, false, 2000, 50, 20, 0.45},
-                                               {false, true, false, 2000, 50, 20, 0.45},
-                                               {false, false, true, 2000, 50, 20, 0.45},
-                                               {true, true, false, 2000, 50, 20, 0.45},
-                                               {true, false, true, 2000, 50, 20, 0.45},
-                                               {false, true, true, 2000, 50, 20, 0.45},
-                                               {true, true, true, 2000, 50, 20, 0.45}};
+    std::vector<TestParams> test_params_vec = {{false, false, false, true, 2000, 50, 20, 0.45},
+                                               {true, false, false, false, 2000, 50, 20, 0.45},
+                                               {false, true, false, true, 2000, 50, 20, 0.45},
+                                               {false, false, true, false, 2000, 50, 20, 0.45},
+                                               {true, true, false, true, 2000, 50, 20, 0.45},
+                                               {true, false, true, false, 2000, 50, 20, 0.45},
+                                               {false, true, true, true, 2000, 50, 20, 0.45},
+                                               {true, true, true, false, 2000, 50, 20, 0.45}};
 
     std::vector<UMAPParams> umap_params_vec(4);
     umap_params_vec[0].n_components = 2;
diff --git a/docs/source/api.rst b/docs/source/api.rst
index 6874b72f34..76af89fa2d 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -63,7 +63,7 @@ Preprocessing, Metrics, and Utilities
 Model Selection and Data Splitting
 ----------------------------------
 
- .. autofunction:: cuml.preprocessing.model_selection.train_test_split
+ .. autofunction:: cuml.model_selection.train_test_split
 
 Feature and Label Encoding (Single-GPU)
 ---------------------------------------
@@ -270,6 +270,9 @@ Naive Bayes
 .. autoclass:: cuml.naive_bayes.GaussianNB
     :members:
 
+.. autoclass:: cuml.naive_bayes.CategoricalNB
+    :members:
+
 Stochastic Gradient Descent
 ---------------------------
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
index faf0bbea70..8f5a6f7bbd 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -79,9 +79,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '21.10'
+version = '21.12'
 # The full version, including alpha/beta/rc tags.
-release = '21.10.00'
+release = '21.12.00'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/notebooks/arima_demo.ipynb b/notebooks/arima_demo.ipynb
index 32d4285f36..d230ee955d 100644
--- a/notebooks/arima_demo.ipynb
+++ b/notebooks/arima_demo.ipynb
@@ -37,7 +37,8 @@
     "\n",
     "import numpy as np\n",
     "import pandas as pd\n",
-    "import matplotlib.pyplot as plt"
+    "import matplotlib.pyplot as plt\n",
+    "import random"
    ]
   },
   {
@@ -109,13 +110,14 @@
     "    \n",
     "    # Range for the prediction\n",
     "    if pred is not None:\n",
-    "        pred_start = pred_start or n_obs\n",
+    "        pred_start = n_obs if pred_start is None else pred_start\n",
     "        pred_end = pred_start + pred.shape[0]\n",
+    "    else:\n",
+    "        pred_end = n_obs\n",
     "    \n",
     "    # Plot the data\n",
     "    for i in range(batch_size):\n",
     "        title = y.columns[i]\n",
-    "        ax[i].plot(np.r_[:n_obs], y[title].to_array(), color=col[0])\n",
     "        if pred is not None:\n",
     "            ax[i].plot(np.r_[pred_start:pred_end],\n",
     "                       pred[pred.columns[i]].to_array(),\n",
@@ -126,10 +128,13 @@
     "                               lower[lower.columns[i]].to_array(),\n",
     "                               upper[upper.columns[i]].to_array(),\n",
     "                               alpha=0.2, color=col[1])\n",
+    "        ax[i].plot(np.r_[:n_obs], y[title].to_array(), color=col[0])\n",
     "        ax[i].title.set_text(title)\n",
+    "        ax[i].set_xlim((0, pred_end))\n",
     "    for i in range(batch_size, r*c):\n",
     "        fig.delaxes(ax[i])\n",
     "    fig.tight_layout()\n",
+    "    fig.patch.set_facecolor('white')\n",
     "    plt.show()"
    ]
   },
@@ -175,7 +180,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "model_mig = ARIMA(df_mig, (0,0,2), fit_intercept=True)\n",
+    "model_mig = ARIMA(df_mig, order=(0,0,2), fit_intercept=True)\n",
     "model_mig.fit()"
    ]
   },
@@ -268,7 +273,7 @@
     "df_pop = load_dataset(\"population_estimate\")\n",
     "\n",
     "# Fit an ARIMA(1,2,1) model\n",
-    "model_pop = ARIMA(df_pop, (1,2,1), fit_intercept=True)\n",
+    "model_pop = ARIMA(df_pop, order=(1,2,1), fit_intercept=True)\n",
     "model_pop.fit()\n",
     "\n",
     "# Predict in-sample and forecast out-of-sample\n",
@@ -321,7 +326,8 @@
     "df_guests = load_dataset(\"guest_nights_by_region\", 4)\n",
     "\n",
     "# Create and fit an ARIMA(1,1,1)(1,1,1)12 model:\n",
-    "model_guests = ARIMA(df_guests, (1,1,1), (1,1,1,12), fit_intercept=False)\n",
+    "model_guests = ARIMA(df_guests, order=(1,1,1), seasonal_order=(1,1,1,12),\n",
+    "                     fit_intercept=False)\n",
     "model_guests.fit()"
    ]
   },
@@ -337,6 +343,90 @@
     "# Visualize after the time step 200\n",
     "visualize(df_guests[200:], fc_guests)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Missing observations\n",
+    "\n",
+    "ARIMA supports missing observations in the data. You can also pad your dataset at the start in order to batch computations even if the series have different lengths.\n",
+    "\n",
+    "To illustrate that, let's create a fake dataset from the seasonal dataset used above. We will simulate series of different lengths and add missing observations:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cut dataset to 100 observations\n",
+    "df_guests_missing = df_guests[:100].copy()\n",
+    "\n",
+    "for title in df_guests_missing.columns:\n",
+    "    # Missing observations at the start to simulate varying lengths\n",
+    "    n_leading = random.randint(5, 40)\n",
+    "    df_guests_missing[title][:n_leading]=None\n",
+    "    \n",
+    "    # Random missing observations in the middle\n",
+    "    missing_obs = random.choices(range(n_leading, 100), k=random.randint(5, 20))\n",
+    "    df_guests_missing[title][missing_obs]=None"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note that missing observations need to be represented by the value `NaN` to convert the dataset to a numeric array. `NA`s in dataframes can be filled with:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_guests_missing = df_guests_missing.fillna(np.nan)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can now fit a model. Here we will do in- and out-of-sample predictions, to showcase how this model can fill the gaps in data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create and fit an ARIMA(1,1,1)(1,1,1)12 model:\n",
+    "model_guests_missing = ARIMA(df_guests_missing, order=(1,1,1), seasonal_order=(1,1,1,12),\n",
+    "                             fit_intercept=False)\n",
+    "model_guests_missing.fit()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Forecast\n",
+    "fc_guests_missing = model_guests_missing.predict(0, 120)\n",
+    "\n",
+    "visualize(df_guests_missing, fc_guests_missing, 0)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note that the model can't form predictions at the start where we padded with missing values. The first in-sample predictions will be equal to a constant value (0 in the absence of intercept)."
+   ]
   }
  ],
  "metadata": {
@@ -356,7 +446,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.8"
+   "version": "3.8.10"
   },
   "mimetype": "text/x-python",
   "name": "python",
diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx
index f2baf10a2b..f05fa71e1b 100644
--- a/python/cuml/cluster/dbscan.pyx
+++ b/python/cuml/cluster/dbscan.pyx
@@ -251,6 +251,10 @@ class DBSCAN(Base,
             input_to_cuml_array(X, order='C',
                                 check_dtype=[np.float32, np.float64])
 
+        if n_rows == 0:
+            raise ValueError("No rows in the input array. DBScan cannot be "
+                             "fitted!")
+
         cdef uintptr_t input_ptr = X_m.ptr
 
         cdef handle_t* handle_ = <handle_t*><size_t>self.handle.getHandle()
diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py
index 692d9e3a0e..39596a2823 100755
--- a/python/cuml/dask/ensemble/randomforestclassifier.py
+++ b/python/cuml/dask/ensemble/randomforestclassifier.py
@@ -74,16 +74,15 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin,
         run different models concurrently in different streams by creating
         handles in several streams.
         If it is None, a new one is created.
-    split_criterion : The criterion used to split nodes.
-        0 for GINI, 1 for ENTROPY, 4 for CRITERION_END.
-        2 and 3 not valid for classification
-        (default = 0)
-    split_algo : 0 for HIST and 1 for GLOBAL_QUANTILE (default = 1)
-        the algorithm to determine how nodes are split in the tree.
-    split_criterion : The criterion used to split nodes.
-        0 for GINI, 1 for ENTROPY, 4 for CRITERION_END.
-        2 and 3 not valid for classification
-        (default = 0)
+    split_criterion : int or string (default = 0 ('gini'))
+        The criterion used to split nodes.
+        0 or 'gini' for GINI, 1 or 'entropy' for ENTROPY,
+        2 or 'mse' for MSE,
+        4 or 'poisson' for POISSON,
+        5 or 'gamma' for GAMMA,
+        6 or 'inverse_gaussian' for INVERSE_GAUSSIAN,
+        2, 'mse', 4, 'poisson', 5, 'gamma', 6, 'inverse_gaussian' not valid
+        for classification
     bootstrap : boolean (default = True)
         Control bootstrapping.
         If set, each tree in the forest is built
@@ -112,17 +111,6 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin,
         If float, then min_samples_split represents a fraction and
         ceil(min_samples_split * n_rows) is the minimum number of samples
         for each split.
-    quantile_per_tree : boolean (default = False)
-        Whether quantile is computed for individual RF trees.
-        Only relevant for GLOBAL_QUANTILE split_algo.
-    use_experimental_backend : boolean (default = True)
-        If set to true and the following conditions are also met, a new
-        experimental backend for decision tree training will be used. The
-        new backend is available only if `split_algo = 1` (GLOBAL_QUANTILE)
-        and `quantile_per_tree = False` (No per tree quantile computation).
-        The new backend is considered stable for classification tasks but
-        not yet for regression tasks. The RAPIDS team is continuing
-        optimization and evaluation of the new backend for regression tasks.
     n_streams : int (default = 4 )
         Number of parallel streams used for forest building
     workers : optional, list of strings
diff --git a/python/cuml/dask/ensemble/randomforestregressor.py b/python/cuml/dask/ensemble/randomforestregressor.py
index 3b21810fb4..328484d6ae 100755
--- a/python/cuml/dask/ensemble/randomforestregressor.py
+++ b/python/cuml/dask/ensemble/randomforestregressor.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
 from cuml.dask.common.base import DelayedPredictionMixin
 from cuml.ensemble import RandomForestRegressor as cuRFR
 from cuml.dask.ensemble.base import \
@@ -68,14 +67,14 @@ class RandomForestRegressor(BaseRandomForestModel, DelayedPredictionMixin,
         run different models concurrently in different streams by creating
         handles in several streams.
         If it is None, a new one is created.
-    split_algo : int (default = 1)
-        0 for HIST, 1 for GLOBAL_QUANTILE
-        The type of algorithm to be used to create the trees.
-    split_criterion : int (default = 2)
+    split_criterion : int or string (default = 2 ('mse'))
         The criterion used to split nodes.
-        0 for GINI, 1 for ENTROPY,
-        2 for MSE, 3 for MAE and 4 for CRITERION_END.
-        0 and 1 not valid for regression
+        0 or 'gini' for GINI, 1 or 'entropy' for ENTROPY,
+        2 or 'mse' for MSE,
+        4 or 'poisson' for POISSON,
+        5 or 'gamma' for GAMMA,
+        6 or 'inverse_gaussian' for INVERSE_GAUSSIAN,
+        0, 'gini', 1, 'entropy' not valid for regression
     bootstrap : boolean (default = True)
         Control bootstrapping.
         If set, each tree in the forest is built
@@ -118,17 +117,6 @@ class RandomForestRegressor(BaseRandomForestModel, DelayedPredictionMixin,
         for median of abs error : 'median_ae'
         for mean of abs error : 'mean_ae'
         for mean square error' : 'mse'
-    quantile_per_tree : boolean (default = False)
-        Whether quantile is computed for individual RF trees.
-        Only relevant for GLOBAL_QUANTILE split_algo.
-    use_experimental_backend : boolean (default = False)
-        If set to true and the following conditions are also met, a new
-        experimental backend for decision tree training will be used. The
-        new backend is available only if `split_algo = 1` (GLOBAL_QUANTILE)
-        and `quantile_per_tree = False` (No per tree quantile computation).
-        The new backend is considered stable for classification tasks but
-        not yet for regression tasks. The RAPIDS team is continuing
-        optimization and evaluation of the new backend for regression tasks.
     n_streams : int (default = 4 )
         Number of parallel streams used for forest building
     workers : optional, list of strings
diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index ee0f1d7f0b..7e7a6b1dc8 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
 import ctypes
 import cupy as cp
 import math
@@ -54,8 +53,15 @@ class BaseRandomForestModel(Base):
                     'random_state', 'warm_start', 'class_weight',
                     'criterion']
 
-    criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE,
-                      '3': MAE, '4': CRITERION_END}
+    criterion_dict = {'0': GINI, 'gini': GINI,
+                      '1': ENTROPY, 'entropy': ENTROPY,
+                      '2': MSE, 'mse': MSE,
+                      '3': MAE, 'mae': MAE,
+                      '4': POISSON, 'poisson': POISSON,
+                      '5': GAMMA, 'gamma': GAMMA,
+                      '6': INVERSE_GAUSSIAN,
+                      'inverse_gaussian': INVERSE_GAUSSIAN,
+                      '7': CRITERION_END}
 
     classes_ = CumlArrayDescriptor()
 
@@ -104,14 +110,6 @@ class BaseRandomForestModel(Base):
                           "recommended. If n_streams is > 1, results may vary "
                           "due to stream/thread timing differences, even when "
                           "random_state is set")
-        if 'use_experimental_backend' in kwargs.keys():
-            warnings.warn("The 'use_experimental_backend' parameter is "
-                          "deprecated and has no effect. "
-                          "It will be removed in 21.10 release.")
-        if 'split_algo' in kwargs.keys():
-            warnings.warn("The 'split_algo' parameter is "
-                          "deprecated and has no effect. "
-                          "It will be removed in 21.10 release.")
         if handle is None:
             handle = Handle(n_streams)
 
@@ -221,15 +219,15 @@ class BaseRandomForestModel(Base):
                     &tl_handle,
                     <RandomForestMetaData[float, int]*>
                     <uintptr_t> self.rf_forest,
-                    <int> self.n_cols,
-                    <int> self.num_classes)
+                    <int> self.n_cols
+                    )
             else:
                 build_treelite_forest(
                     &tl_handle,
                     <RandomForestMetaData[float, float]*>
                     <uintptr_t> self.rf_forest,
-                    <int> self.n_cols,
-                    <int> REGRESSION_MODEL)
+                    <int> self.n_cols
+                    )
 
         self.treelite_handle = <uintptr_t> tl_handle
         return self.treelite_handle
@@ -247,8 +245,10 @@ class BaseRandomForestModel(Base):
             input_to_cuml_array(X, check_dtype=[np.float32, np.float64],
                                 order='F')
         if self.n_bins > self.n_rows:
-            raise ValueError("The number of bins,`n_bins` can not be greater"
-                             " than the number of samples used for training.")
+            warnings.warn("The number of bins, `n_bins` is greater than "
+                          "the number of samples used for training. "
+                          "Changing `n_bins` to number of training samples.")
+            self.n_bins = self.n_rows
 
         if self.RF_type == CLASSIFICATION:
             y_m, _, _, y_dtype = \
@@ -329,14 +329,14 @@ class BaseRandomForestModel(Base):
                                 check_cols=self.n_cols)
 
         if dtype == np.float64 and not convert_dtype:
-            raise TypeError("GPU based predict only accepts np.float32 data. \
-                            Please set convert_dtype=True to convert the test \
-                            data to the same dtype as the data used to train, \
-                            ie. np.float32. If you would like to use test \
-                            data of dtype=np.float64 please set \
-                            predict_model='CPU' to use the CPU implementation \
-                            of predict.")
-
+            warnings.warn("GPU based predict only accepts "
+                          "np.float32 data. The model was "
+                          "trained on np.float64 data hence "
+                          "cannot use GPU-based prediction! "
+                          "\nDefaulting to CPU-based Prediction. "
+                          "\nTo predict on float-64 data, set "
+                          "parameter predict_model = 'CPU'")
+            return self._predict_model_on_cpu(X, convert_dtype=convert_dtype)
         treelite_handle = self._obtain_treelite_handle()
 
         storage_type = \
@@ -365,6 +365,7 @@ class BaseRandomForestModel(Base):
         self.treelite_serialized_model = None
 
         super().set_params(**params)
+        return self
 
 
 def _check_fil_parameter_validity(depth, algo, fil_sparse_format):
diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd
index 9e3c23fb4f..638b1d7a10 100644
--- a/python/cuml/ensemble/randomforest_shared.pxd
+++ b/python/cuml/ensemble/randomforest_shared.pxd
@@ -42,6 +42,9 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
         ENTROPY,
         MSE,
         MAE,
+        POISSON,
+        GAMMA,
+        INVERSE_GAUSSIAN,
         CRITERION_END
 
 cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
@@ -77,8 +80,8 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
     #
     cdef void build_treelite_forest[T, L](ModelHandle*,
                                           RandomForestMetaData[T, L]*,
-                                          int,
-                                          int) except +
+                                          int
+                                          ) except +
 
     cdef void delete_rf_metadata[T, L](RandomForestMetaData[T, L]*) except +
 
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index f68bee6088..e6006ddf55 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -16,7 +16,6 @@
 #
 
 # distutils: language = c++
-
 import numpy as np
 import rmm
 import warnings
@@ -176,13 +175,11 @@ class RandomForestClassifier(BaseRandomForestModel,
     -----------
     n_estimators : int (default = 100)
         Number of trees in the forest. (Default changed to 100 in cuML 0.11)
-    split_criterion : The criterion used to split nodes.
-        0 for GINI, 1 for ENTROPY
-        2 and 3 not valid for classification
-        (default = 0)
-    split_algo : int (default = 1)
-        Deprecated and currrently has no effect.
-        .. deprecated:: 21.06
+    split_criterion : int or string (default = 0 ('gini'))
+        The criterion used to split nodes.
+        0 or 'gini' for GINI, 1 or 'entropy' for ENTROPY,
+        2 or 'mse' for MSE
+        2 or 'mse' not valid for classification
     bootstrap : boolean (default = True)
         Control bootstrapping.
         If True, each tree in the forest is built
@@ -226,9 +223,6 @@ class RandomForestClassifier(BaseRandomForestModel,
     min_impurity_decrease : float (default = 0.0)
         Minimum decrease in impurity requried for
         node to be spilt.
-    use_experimental_backend : boolean (default = True)
-        Deprecated and currrently has no effect.
-        .. deprecated:: 21.08
     max_batch_size: int (default = 4096)
         Maximum number of nodes that can be processed in a given batch.
     random_state : int (default = None)
@@ -559,8 +553,7 @@ class RandomForestClassifier(BaseRandomForestModel,
     @insert_into_docstring(parameters=[('dense', '(n_samples, n_features)')],
                            return_values=[('dense', '(n_samples, 1)')])
     def predict(self, X, predict_model="GPU", threshold=0.5,
-                algo='auto', num_classes=None,
-                convert_dtype=True,
+                algo='auto', convert_dtype=True,
                 fil_sparse_format='auto') -> CumlArray:
         """
         Predicts the labels for X.
@@ -589,13 +582,6 @@ class RandomForestClassifier(BaseRandomForestModel,
         threshold : float (default = 0.5)
             Threshold used for classification. Optional and required only
             while performing the predict operation on the GPU.
-        num_classes : int (default = None)
-            number of different classes present in the dataset.
-
-            .. deprecated:: 0.16
-                Parameter 'num_classes' is deprecated and will be removed in
-                an upcoming version. The number of classes passed must match
-                the number of classes the model was trained on.
 
         convert_dtype : bool, optional (default = True)
             When set to True, the predict method will, when necessary, convert
@@ -617,24 +603,19 @@ class RandomForestClassifier(BaseRandomForestModel,
         y : {}
         """
         nvtx_range_push("predict RF-Classifier @randomforestclassifier.pyx")
-        if num_classes:
-            warnings.warn("num_classes is deprecated and will be removed"
-                          " in an upcoming version")
-            if num_classes != self.num_classes:
-                raise NotImplementedError("limiting num_classes for predict"
-                                          " is not implemented")
         if predict_model == "CPU":
             preds = self._predict_model_on_cpu(X,
                                                convert_dtype=convert_dtype)
-
         elif self.dtype == np.float64:
-            raise TypeError("GPU based predict only accepts np.float32 data. \
-                            In order use the GPU predict the model should \
-                            also be trained using a np.float32 dataset. \
-                            If you would like to use np.float64 dtype \
-                            then please use the CPU based predict by \
-                            setting predict_model = 'CPU'")
-
+            warnings.warn("GPU based predict only accepts "
+                          "np.float32 data. The model was "
+                          "trained on np.float64 data hence "
+                          "cannot use GPU-based prediction! "
+                          "\nDefaulting to CPU-based Prediction. "
+                          "\nTo predict on float-64 data, set "
+                          "parameter predict_model = 'CPU'")
+            preds = self._predict_model_on_cpu(X,
+                                               convert_dtype=convert_dtype)
         else:
             preds = \
                 self._predict_model_on_gpu(X=X, output_class=True,
@@ -650,7 +631,7 @@ class RandomForestClassifier(BaseRandomForestModel,
     @insert_into_docstring(parameters=[('dense', '(n_samples, n_features)')],
                            return_values=[('dense', '(n_samples, 1)')])
     def predict_proba(self, X, algo='auto',
-                      num_classes=None, convert_dtype=True,
+                      convert_dtype=True,
                       fil_sparse_format='auto') -> CumlArray:
         """
         Predicts class probabilites for X. This function uses the GPU
@@ -673,14 +654,6 @@ class RandomForestClassifier(BaseRandomForestModel,
              * ``'batch_tree_reorg'`` is used for dense storage
                and 'naive' for sparse storage
 
-        num_classes : int (default = None)
-            number of different classes present in the dataset.
-
-            .. deprecated:: 0.16
-                Parameter 'num_classes' is deprecated and will be removed in
-                an upcoming version. The number of classes passed must match
-                the number of classes the model was trained on.
-
         convert_dtype : bool, optional (default = True)
             When set to True, the predict method will, when necessary, convert
             the input to the data type which was used to train the model. This
@@ -708,15 +681,6 @@ class RandomForestClassifier(BaseRandomForestModel,
                             then please use the CPU based predict by \
                             setting predict_model = 'CPU'")
 
-        if num_classes:
-            warnings.warn("num_classes is deprecated and will be removed"
-                          " in an upcoming version")
-            if num_classes != self.num_classes:
-                raise NotImplementedError("The number of classes in the test "
-                                          "dataset should be equal to the "
-                                          "number of classes present in the "
-                                          "training dataset.")
-
         preds_proba = \
             self._predict_model_on_gpu(X, output_class=True,
                                        algo=algo,
@@ -729,7 +693,7 @@ class RandomForestClassifier(BaseRandomForestModel,
     @insert_into_docstring(parameters=[('dense', '(n_samples, n_features)'),
                                        ('dense_intdtype', '(n_samples, 1)')])
     def score(self, X, y, threshold=0.5,
-              algo='auto', num_classes=None, predict_model="GPU",
+              algo='auto', predict_model="GPU",
               convert_dtype=True, fil_sparse_format='auto'):
         """
         Calculates the accuracy metric score of the model for X.
@@ -755,13 +719,6 @@ class RandomForestClassifier(BaseRandomForestModel,
             threshold is used to for classification
             This is optional and required only while performing the
             predict operation on the GPU.
-        num_classes : int (default = None)
-            number of different classes present in the dataset.
-
-            .. deprecated:: 0.16
-                Parameter 'num_classes' is deprecated and will be removed in
-                an upcoming version. The number of classes passed must match
-                the number of classes the model was trained on.
 
         convert_dtype : boolean, default=True
             whether to convert input data to correct dtype automatically
@@ -803,7 +760,6 @@ class RandomForestClassifier(BaseRandomForestModel,
                              threshold=threshold, algo=algo,
                              convert_dtype=convert_dtype,
                              predict_model=predict_model,
-                             num_classes=num_classes,
                              fil_sparse_format=fil_sparse_format)
 
         cdef uintptr_t preds_ptr
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index c96ff64eb6..0ede37da66 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -161,22 +161,14 @@ class RandomForestRegressor(BaseRandomForestModel,
     -----------
     n_estimators : int (default = 100)
         Number of trees in the forest. (Default changed to 100 in cuML 0.11)
-    split_algo : int (default = 1)
-        The algorithm to determine how nodes are split in the tree.
-        Can be changed only for the old backend [deprecated].
-        0 for HIST and 1 for GLOBAL_QUANTILE. Default is GLOBAL_QUANTILE.
-        The default backend does not support HIST.
-        HIST currently uses a slower tree-building algorithm so
-        GLOBAL_QUANTILE is recommended for most cases.
-
-        .. deprecated:: 21.06
-           Parameter 'split_algo' is deprecated and will be removed in
-           subsequent release.
-    split_criterion : int (default = 2)
+    split_criterion : int or string (default = 2 ('mse'))
         The criterion used to split nodes.
-        0 for GINI, 1 for ENTROPY,
-        2 for MSE
-        0 and 1 not valid for regression
+        0 or 'gini' for GINI, 1 or 'entropy' for ENTROPY,
+        2 or 'mse' for MSE,
+        4 or 'poisson' for POISSON,
+        5 or 'gamma' for GAMMA,
+        6 or 'inverse_gaussian' for INVERSE_GAUSSIAN,
+        0, 'gini', 1, 'entropy' not valid for regression.
     bootstrap : boolean (default = True)
         Control bootstrapping.
         If True, each tree in the forest is built
@@ -228,9 +220,6 @@ class RandomForestRegressor(BaseRandomForestModel,
         for median of abs error : 'median_ae'
         for mean of abs error : 'mean_ae'
         for mean square error' : 'mse'
-    use_experimental_backend : boolean (default = True)
-        Deprecated and currrently has no effect.
-        .. deprecated:: 21.08
     max_batch_size: int (default = 4096)
         Maximum number of nodes that can be processed in a given batch.
     random_state : int (default = None)
@@ -586,15 +575,16 @@ class RandomForestRegressor(BaseRandomForestModel,
         nvtx_range_push("predict RF-Regressor @randomforestregressor.pyx")
         if predict_model == "CPU":
             preds = self._predict_model_on_cpu(X, convert_dtype)
-
         elif self.dtype == np.float64:
-            raise TypeError("GPU based predict only accepts np.float32 data. \
-                            In order use the GPU predict the model should \
-                            also be trained using a np.float32 dataset. \
-                            If you would like to use np.float64 dtype \
-                            then please use the CPU based predict by \
-                            setting predict_model = 'CPU'")
-
+            warnings.warn("GPU based predict only accepts "
+                          "np.float32 data. The model was "
+                          "trained on np.float64 data hence "
+                          "cannot use GPU-based prediction! "
+                          "\nDefaulting to CPU-based Prediction. "
+                          "\nTo predict on float-64 data, set "
+                          "parameter predict_model = 'CPU'")
+            preds = self._predict_model_on_cpu(X,
+                                               convert_dtype=convert_dtype)
         else:
             preds = self._predict_model_on_gpu(
                 X=X,
diff --git a/python/cuml/explainer/kernel_shap.pyx b/python/cuml/explainer/kernel_shap.pyx
index 6a2281e76e..eb312775e8 100644
--- a/python/cuml/explainer/kernel_shap.pyx
+++ b/python/cuml/explainer/kernel_shap.pyx
@@ -645,8 +645,13 @@ def _weighted_linear_regression(X,
         # from nonzero_inds and some additional arrays
         # nonzero_inds tells us which cols of X to use
         y = y - X[:, nonzero_inds[-1]] * (fx - expected_value)
-        Xw = cp.transpose(
-            cp.transpose(X[:, nonzero_inds[:-1]]) - X[:, nonzero_inds[-1]])
+        if len(nonzero_inds) == 1:
+            # when only one index is nonzero, use that column
+            Xw = X[:, nonzero_inds]
+        else:
+            Xw = cp.transpose(
+                    cp.transpose(
+                        X[:, nonzero_inds[:-1]]) - X[:, nonzero_inds[-1]])
 
         Xw = Xw * cp.sqrt(weights[:, cp.newaxis])
         y = y * cp.sqrt(weights)
diff --git a/python/cuml/explainer/sampling.py b/python/cuml/explainer/sampling.py
index 70297a55e2..ed8f43218b 100644
--- a/python/cuml/explainer/sampling.py
+++ b/python/cuml/explainer/sampling.py
@@ -12,11 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import cudf
 import cupy as cp
-import numpy as np
-import pandas as pd
-from numba import cuda
 from scipy.sparse import issparse
 
 import cuml
@@ -65,24 +61,15 @@ def kmeans_sampling(X, k, round_values=True, detailed=False, random_state=0):
                         dtypes: cuDF DataFrame, cuDF Series, cupy, numba,\
                         numpy, pandas DataFrame, pandas Series")
 
-    if output_dtype == cudf.DataFrame:
+    if "DataFrame" in str(output_dtype):
         group_names = X.columns
-        X = X.values
-    elif output_dtype == cudf.Series:
+        X = cp.array(X.values, copy=False)
+    if "Series" in str(output_dtype):
         group_names = X.name
-        X = X.values.reshape(-1, 1)
-    elif output_dtype == pd.DataFrame:
-        group_names = X.columns
-        X = cp.array(X.values)
-    elif output_dtype == pd.Series:
-        group_names = X.name
-        X = cp.array(X.values.reshape(-1, 1))
+        X = cp.array(X.values.reshape(-1, 1), copy=False)
     else:
         # it's either numpy, cupy or numba
-        if output_dtype == cuda.devicearray.DeviceNDArrayBase:
-            X = cp.array(X)
-        elif output_dtype == np.ndarray:
-            X = cp.array(X)
+        X = cp.array(X, copy=False)
         try:
             # more than one column
             group_names = [str(i) for i in range(X.shape[1])]
diff --git a/python/cuml/feature_extraction/_tfidf_vectorizer.py b/python/cuml/feature_extraction/_tfidf_vectorizer.py
index e2cc4158b6..fbeeba7fc2 100644
--- a/python/cuml/feature_extraction/_tfidf_vectorizer.py
+++ b/python/cuml/feature_extraction/_tfidf_vectorizer.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -256,3 +256,13 @@ def transform(self, raw_documents):
         """
         X = super().transform(raw_documents)
         return self._tfidf.transform(X, copy=False)
+
+    def get_feature_names(self):
+        """
+        Array mapping from feature integer indices to feature name.
+        Returns
+        -------
+        feature_names : Series
+            A list of feature names.
+        """
+        return super().get_feature_names()
diff --git a/python/cuml/linear_model/linear_regression.pyx b/python/cuml/linear_model/linear_regression.pyx
index a6a72292d4..a4023e9307 100644
--- a/python/cuml/linear_model/linear_regression.pyx
+++ b/python/cuml/linear_model/linear_regression.pyx
@@ -28,6 +28,7 @@ from libcpp cimport bool
 from libc.stdint cimport uintptr_t
 from libc.stdlib cimport calloc, malloc, free
 
+from cuml import Handle
 from cuml.common.array import CumlArray
 from cuml.common.array_descriptor import CumlArrayDescriptor
 from cuml.common.base import Base
@@ -128,10 +129,22 @@ class LinearRegression(Base,
 
     Parameters
     -----------
-    algorithm : 'eig' or 'svd' (default = 'eig')
-        Eig uses a eigendecomposition of the covariance matrix, and is much
-        faster.
-        SVD is slower, but guaranteed to be stable.
+    algorithm : {'svd', 'eig', `qr`, 'svd-qr', 'svd-jacobi'}, (default = 'eig')
+        Choose an algorithm:
+
+          * 'svd' - alias for svd-jacobi;
+          * 'eig' - use an eigendecomposition of the covariance matrix;
+          * 'qr'  - use QR decomposition algorithm and solve `Rx = Q^T y`
+          * 'svd-qr' - compute SVD decomposition using QR algorithm
+          * 'svd-jacobi' - compute SVD decomposition using Jacobi iterations.
+
+        Among these algorithms, only 'svd-jacobi' supports the case when the
+        number of features is larger than the sample size; this algorithm
+        is force-selected automatically in such a case.
+
+        For the broad range of inputs, 'eig' and `qr` are usually the fastest,
+        followed by 'svd-jacobi' and then 'svd-qr'. In theory, SVD-based
+        algorithms are more stable.
     fit_intercept : boolean (default = True)
         If True, LinearRegression tries to correct for the global mean of y.
         If False, the model expects that you have centered the data.
@@ -193,6 +206,10 @@ class LinearRegression(Base,
 
     def __init__(self, *, algorithm='eig', fit_intercept=True, normalize=False,
                  handle=None, verbose=False, output_type=None):
+        if handle is None and algorithm == 'eig':
+            # if possible, create two streams, so that eigenvalue decomposition
+            # can benefit from running independent operations concurrently.
+            handle = Handle(2)
         super().__init__(handle=handle,
                          verbose=verbose,
                          output_type=output_type)
@@ -203,7 +220,7 @@ class LinearRegression(Base,
 
         self.fit_intercept = fit_intercept
         self.normalize = normalize
-        if algorithm in ['svd', 'eig']:
+        if algorithm in ['svd', 'eig', 'qr', 'svd-qr', 'svd-jacobi']:
             self.algorithm = algorithm
             self.algo = self._get_algorithm_int(algorithm)
         else:
@@ -215,7 +232,10 @@ class LinearRegression(Base,
     def _get_algorithm_int(self, algorithm):
         return {
             'svd': 0,
-            'eig': 1
+            'eig': 1,
+            'qr': 2,
+            'svd-qr': 3,
+            'svd-jacobi': 0
         }[algorithm]
 
     @generate_docstring()
diff --git a/python/cuml/metrics/distance_type.pxd b/python/cuml/metrics/distance_type.pxd
index 93cf1ad9e9..4286ea1c9d 100644
--- a/python/cuml/metrics/distance_type.pxd
+++ b/python/cuml/metrics/distance_type.pxd
@@ -33,5 +33,8 @@ cdef extern from "raft/linalg/distance_type.h" namespace "raft::distance":
         Haversine "raft::distance::DistanceType::Haversine"
         BrayCurtis "raft::distance::DistanceType::BrayCurtis"
         JensenShannon "raft::distance::DistanceType::JensenShannon"
+        HammingUnexpanded "raft::distance::DistanceType::HammingUnexpanded"
+        KLDivergence "raft::distance::DistanceType::KLDivergence"
+        RusselRaoExpanded "raft::distance::DistanceType::RusselRaoExpanded"
         DiceExpanded "raft::distance::DistanceType::DiceExpanded"
         Precomputed "raft::distance::DistanceType::Precomputed"
diff --git a/python/cuml/metrics/pairwise_distances.pyx b/python/cuml/metrics/pairwise_distances.pyx
index d43fcb4329..29f9d492ac 100644
--- a/python/cuml/metrics/pairwise_distances.pyx
+++ b/python/cuml/metrics/pairwise_distances.pyx
@@ -73,7 +73,12 @@ PAIRWISE_DISTANCE_METRICS = {
     "canberra": DistanceType.Canberra,
     "chebyshev": DistanceType.Linf,
     "minkowski": DistanceType.LpUnexpanded,
-    "hellinger": DistanceType.HellingerExpanded
+    "hellinger": DistanceType.HellingerExpanded,
+    "correlation": DistanceType.CorrelationExpanded,
+    "jensenshannon": DistanceType.JensenShannon,
+    "hamming": DistanceType.HammingUnexpanded,
+    "kldivergence": DistanceType.KLDivergence,
+    "russellrao": DistanceType.RusselRaoExpanded
 }
 
 PAIRWISE_DISTANCE_SPARSE_METRICS = {
@@ -217,6 +222,11 @@ def pairwise_distances(X, Y=None, metric="euclidean", handle=None,
     handle = Handle() if handle is None else handle
     cdef handle_t *handle_ = <handle_t*> <size_t> handle.getHandle()
 
+    if metric in ['russellrao'] and not np.all(X.data == 1.):
+        warnings.warn("X was converted to boolean for metric {}"
+                      .format(metric))
+        X = np.where(X != 0., 1.0, 0.0)
+
     # Get the input arrays, preserve order and type where possible
     X_m, n_samples_x, n_features_x, dtype_x = \
         input_to_cuml_array(X, order="K", check_dtype=[np.float32, np.float64])
@@ -235,12 +245,16 @@ def pairwise_distances(X, Y=None, metric="euclidean", handle=None,
         if (n_samples_x == 1 or n_features_x == 1):
             input_order = "K"
 
+        if metric in ['russellrao'] and not np.all(Y.data == 1.):
+            warnings.warn("Y was converted to boolean for metric {}"
+                          .format(metric))
+            Y = np.where(Y != 0., 1.0, 0.0)
+
         Y_m, n_samples_y, n_features_y, dtype_y = \
             input_to_cuml_array(Y, order=input_order,
                                 convert_to_dtype=(dtype_x if convert_dtype
                                                   else None),
                                 check_dtype=[dtype_x])
-
         # Get the order from Y if necessary (It's possible to set order="F" in
         # input_to_cuml_array and have Y_m.order=="C")
         if (input_order == "K"):
diff --git a/python/cuml/metrics/regression.pyx b/python/cuml/metrics/regression.pyx
index daf5775f88..d6d56a09cb 100644
--- a/python/cuml/metrics/regression.pyx
+++ b/python/cuml/metrics/regression.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -104,13 +104,15 @@ def _prepare_input_reg(y_true, y_pred, sample_weight, multioutput):
     Helper function to avoid code duplication for regression metrics.
     Converts inputs to CumlArray and check multioutput parameter validity.
     """
+    y_true = y_true.squeeze() if len(y_true) > 1 else y_true
     y_true, n_rows, n_cols, ytype = \
         input_to_cuml_array(y_true, check_dtype=[np.float32, np.float64,
                                                  np.int32, np.int64])
 
+    y_pred = y_pred.squeeze() if len(y_pred) > 1 else y_pred
     y_pred, _, _, _ = \
-        input_to_cuml_array(y_pred, check_dtype=ytype,
-                            check_rows=n_rows, check_cols=n_cols)
+        input_to_cuml_array(y_pred, check_dtype=ytype, check_rows=n_rows,
+                            check_cols=n_cols)
 
     if sample_weight is not None:
         sample_weight, _, _, _ = \
diff --git a/python/cuml/metrics/trustworthiness.pyx b/python/cuml/metrics/trustworthiness.pyx
index 1dee80aced..e1086ed2db 100644
--- a/python/cuml/metrics/trustworthiness.pyx
+++ b/python/cuml/metrics/trustworthiness.pyx
@@ -83,6 +83,9 @@ def trustworthiness(X, X_embedded, handle=None, n_neighbors=5,
             Trustworthiness of the low-dimensional embedding
     """
 
+    if n_neighbors > X.shape[0]:
+        raise ValueError("n_neighbors must be <= the number of rows.")
+
     handle = cuml.raft.common.handle.Handle() if handle is None else handle
 
     cdef uintptr_t d_X_ptr
diff --git a/python/cuml/naive_bayes/__init__.py b/python/cuml/naive_bayes/__init__.py
index 73d7b01a46..9e08a5f971 100644
--- a/python/cuml/naive_bayes/__init__.py
+++ b/python/cuml/naive_bayes/__init__.py
@@ -17,3 +17,4 @@
 from cuml.naive_bayes.naive_bayes import MultinomialNB
 from cuml.naive_bayes.naive_bayes import BernoulliNB
 from cuml.naive_bayes.naive_bayes import GaussianNB
+from cuml.naive_bayes.naive_bayes import CategoricalNB
diff --git a/python/cuml/naive_bayes/naive_bayes.py b/python/cuml/naive_bayes/naive_bayes.py
index d74d6a6642..ce33c828df 100644
--- a/python/cuml/naive_bayes/naive_bayes.py
+++ b/python/cuml/naive_bayes/naive_bayes.py
@@ -58,13 +58,13 @@ def count_features_coo_kernel(float_dtype, int_dtype):
       int row = rows[i];
       int col = cols[i];
       {0} val = vals[i];
-
+      {1} label = labels[row];
+      unsigned out_idx = (col * n_classes) + label;
       if(has_weights)
         val *= weights[i];
 
       if(square) val *= val;
-      {1} label = labels[row];
-      atomicAdd(out + ((col * n_classes) + label), val);
+      atomicAdd(out + out_idx, val);
     }'''
 
     return cuda_kernel_factory(kernel_str, (float_dtype, int_dtype),
@@ -97,7 +97,8 @@ def count_features_dense_kernel(float_dtype, int_dtype):
      bool has_weights,
      int n_classes,
      bool square,
-     bool rowMajor) {
+     bool rowMajor,
+     bool categorical) {
 
       int row = blockIdx.x * blockDim.x + threadIdx.x;
       int col = blockIdx.y * blockDim.y + threadIdx.y;
@@ -106,16 +107,22 @@ def count_features_dense_kernel(float_dtype, int_dtype):
 
       {0} val = !rowMajor ?
             in[col * n_rows + row] : in[row * n_cols + col];
+      {1} label = labels[row];
+      unsigned out_idx = ((col * n_classes) + label);
 
+      if (categorical)
+      {
+        out_idx = (val * n_classes * n_cols) + (label * n_cols) + col;
+        val = 1;
+      }
       if(has_weights)
         val *= weights[row];
 
       if(val == 0.0) return;
 
       if(square) val *= val;
-      {1} label = labels[row];
 
-      atomicAdd(out + ((col * n_classes) + label), val);
+      atomicAdd(out + out_idx, val);
     }'''
 
     return cuda_kernel_factory(kernel_str, (float_dtype, int_dtype),
@@ -498,7 +505,7 @@ def _update_mean_variance(self, X, Y, sample_weight=None):
         if X.shape[0] == 0:
             return mu, var
 
-        # Make sure Y iclass_count_s cp array not CumlArray
+        # Make sure Y is cp array not CumlArray
         Y = cp.asarray(Y)
 
         new_mu = cp.zeros((self.n_classes_, self.n_features_), order="F",
@@ -558,7 +565,8 @@ def _update_mean_variance(self, X, Y, sample_weight=None):
                                   sample_weight.shape[0] > 0,
                                   self.n_classes_,
                                   False,
-                                  X.flags["C_CONTIGUOUS"]))
+                                  X.flags["C_CONTIGUOUS"],
+                                  False))
 
             # Run again for variance
             count_features_dense((math.ceil(n_rows / tpb),
@@ -573,7 +581,8 @@ def _update_mean_variance(self, X, Y, sample_weight=None):
                                   sample_weight.shape[0] > 0,
                                   self.n_classes_,
                                   True,
-                                  X.flags["C_CONTIGUOUS"]))
+                                  X.flags["C_CONTIGUOUS"],
+                                  False))
 
         count_classes = count_classes_kernel(X.dtype, labels_dtype)
         count_classes((math.ceil(n_rows / tpb),), (tpb,),
@@ -711,7 +720,7 @@ def partial_fit(self, X, y, classes=None,
 
         sample_weight : array-like of shape (n_samples)
                         Weights applied to individual samples (1. for
-                        unweighted). Currently sample weight is ignored
+                        unweighted). Currently sample weight is ignored.
 
         Returns
         -------
@@ -746,19 +755,19 @@ def _partial_fit(self, X, y, sample_weight=None,
                                                   if convert_dtype
                                                   else False),
                                 check_dtype=expected_y_dtype).array
-
-        Y, label_classes = make_monotonic(y, copy=True)
+        if _classes is not None:
+            _classes, *_ = input_to_cuml_array(_classes, order='K',
+                                               convert_to_dtype=(
+                                                   expected_y_dtype
+                                                   if convert_dtype
+                                                   else False))
+        Y, label_classes = make_monotonic(y, classes=_classes, copy=True)
 
         X, Y = self._check_X_y(X, Y)
 
         if not self.fit_called_:
             self.fit_called_ = True
             if _classes is not None:
-                _classes, *_ = input_to_cuml_array(_classes, order='K',
-                                                   convert_to_dtype=(
-                                                       expected_y_dtype
-                                                       if convert_dtype
-                                                       else False))
                 check_labels(Y, _classes.to_output('cupy'))
                 self.classes_ = _classes
             else:
@@ -795,7 +804,9 @@ def fit(self, X, y, sample_weight=None) -> "_BaseDiscreteNB":
         y : array-like shape (n_samples) Target values.
         sample_weight : array-like of shape (n_samples)
             Weights applied to individial samples (1. for unweighted).
+            Currently sample weight is ignored.
         """
+        self.fit_called_ = False
         return self.partial_fit(X, y, sample_weight)
 
     def _init_counters(self, n_effective_classes, n_features, dtype):
@@ -864,7 +875,8 @@ def _count(self, X, Y, classes):
              sample_weight.shape[0] > 0,
              n_classes,
              False,
-             X.flags["C_CONTIGUOUS"]))
+             X.flags["C_CONTIGUOUS"],
+             False))
 
         tpb = 256
         count_classes = count_classes_kernel(X.dtype, labels_dtype)
@@ -980,6 +992,22 @@ class MultinomialNB(_BaseDiscreteNB):
         Sets logging level. It must be one of `cuml.common.logger.level_*`.
         See :ref:`verbosity-levels` for more info.
 
+    Attributes
+    ----------
+    class_count_ : ndarray of shape (n_classes)
+        Number of samples encountered for each class during fitting.
+    class_log_prior_ : ndarray of shape (n_classes)
+        Log probability of each class (smoothed).
+    classes_ : ndarray of shape (n_classes,)
+        Class labels known to the classifier
+    feature_count_ : ndarray of shape (n_classes, n_features)
+        Number of samples encountered for each (class, feature)
+        during fitting.
+    feature_log_prob_ : ndarray of shape (n_classes, n_features)
+        Empirical log probability of features given a class, P(x_i|y).
+    n_features_ : int
+        Number of features of each sample.
+
     Examples
     --------
 
@@ -1117,16 +1145,14 @@ class BernoulliNB(_BaseDiscreteNB):
     Attributes
     ----------
     class_count_ : ndarray of shape (n_classes)
-        Number of samples encountered for each class during fitting. This
-        value is weighted by the sample weight when provided.
+        Number of samples encountered for each class during fitting.
     class_log_prior_ : ndarray of shape (n_classes)
         Log probability of each class (smoothed).
     classes_ : ndarray of shape (n_classes,)
         Class labels known to the classifier
     feature_count_ : ndarray of shape (n_classes, n_features)
         Number of samples encountered for each (class, feature)
-        during fitting. This value is weighted by the sample weight when
-        provided.
+        during fitting.
     feature_log_prob_ : ndarray of shape (n_classes, n_features)
         Empirical log probability of features given a class, P(x_i|y).
     n_features_ : int
@@ -1136,7 +1162,7 @@ class BernoulliNB(_BaseDiscreteNB):
     --------
     >>> import cupy as cp
     >>> rng = cp.random.RandomState(1)
-    >>> X = rng.randint(5, size=(6, 100))
+    >>> X = rng.randint(5, size=(6, 100), dtype=cp.int32)
     >>> Y = cp.array([1, 2, 3, 4, 4, 5])
     >>> from cuml.naive_bayes import BernoulliNB
     >>> clf = BernoulliNB()
@@ -1224,3 +1250,385 @@ def get_param_names(self):
                 "binarize",
                 "fit_prior",
             ]
+
+
+class CategoricalNB(_BaseDiscreteNB):
+    """
+    Naive Bayes classifier for categorical features
+    The categorical Naive Bayes classifier is suitable for classification with
+    discrete features that are categorically distributed. The categories of
+    each feature are drawn from a categorical distribution.
+
+    Parameters
+    ----------
+    alpha : float, default=1.0
+        Additive (Laplace/Lidstone) smoothing parameter
+        (0 for no smoothing).
+    fit_prior : bool, default=True
+        Whether to learn class prior probabilities or not.
+        If false, a uniform prior will be used.
+    class_prior : array-like of shape (n_classes,), default=None
+        Prior probabilities of the classes. If specified the priors are not
+        adjusted according to the data.
+    output_type : {'input', 'cudf', 'cupy', 'numpy', 'numba'}, default=None
+        Variable to control output type of the results and attributes of
+        the estimator. If None, it'll inherit the output type set at the
+        module level, `cuml.global_settings.output_type`.
+        See :ref:`output-data-type-configuration` for more info.
+    handle : cuml.Handle
+        Specifies the cuml.handle that holds internal CUDA state for
+        computations in this model. Most importantly, this specifies the
+        CUDA stream that will be used for the model's computations, so
+        users can run different models concurrently in different streams
+        by creating handles in several streams.
+        If it is None, a new one is created.
+    verbose : int or boolean, default=False
+        Sets logging level. It must be one of `cuml.common.logger.level_*`.
+        See :ref:`verbosity-levels` for more info.
+
+    Attributes
+    ----------
+    category_count_ : ndarray of shape (n_features, n_classes, n_categories)
+        With n_categories being the highest category of all the features.
+        This array provides the number of samples encountered for each feature,
+        class and category of the specific feature.
+    class_count_ : ndarray of shape (n_classes,)
+        Number of samples encountered for each class during fitting.
+    class_log_prior_ : ndarray of shape (n_classes,)
+        Smoothed empirical log probability for each class.
+    classes_ : ndarray of shape (n_classes,)
+        Class labels known to the classifier
+    feature_log_prob_ : ndarray of shape (n_features, n_classes, n_categories)
+        With n_categories being the highest category of all the features.
+        Each array of shape (n_classes, n_categories) provides the empirical
+        log probability of categories given the respective feature
+        and class, ``P(x_i|y)``.
+        This attribute is not available when the model has been trained with
+        sparse data.
+    n_features_ : int
+        Number of features of each sample.
+
+    Examples
+    --------
+    >>> import cupy as cp
+    >>> rng = cp.random.RandomState(1)
+    >>> X = rng.randint(5, size=(6, 100), dtype=cp.int32)
+    >>> y = cp.array([1, 2, 3, 4, 5, 6])
+    >>> from cuml.naive_bayes import CategoricalNB
+    >>> clf = CategoricalNB()
+    >>> clf.fit(X, y)
+    CategoricalNB()
+    >>> print(clf.predict(X[2:3]))
+    [3]
+    """
+    def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None,
+                 output_type=None, handle=None, verbose=False):
+        super(CategoricalNB, self).__init__(class_prior=class_prior,
+                                            handle=handle,
+                                            output_type=output_type,
+                                            verbose=verbose)
+        self.alpha = alpha
+        self.fit_prior = fit_prior
+
+    def _check_X_y(self, X, y):
+        if cp.sparse.isspmatrix(X):
+            warnings.warn("X dtype is not int32. X will be "
+                          "converted, which will increase memory consumption")
+            X.data = X.data.astype(cp.int32)
+            x_min = X.data.min()
+        else:
+            if X.dtype not in [cp.int32]:
+                warnings.warn("X dtype is not int32. X will be "
+                              "converted, which will increase memory "
+                              "consumption")
+                X = input_to_cupy_array(X, order='K',
+                                        convert_to_dtype=cp.int32).array
+            x_min = X.min()
+        if x_min < 0:
+            raise ValueError("Negative values in data passed to CategoricalNB")
+        return X, y
+
+    def _check_X(self, X):
+        if cp.sparse.isspmatrix(X):
+            warnings.warn("X dtype is not int32. X will be "
+                          "converted, which will increase memory consumption")
+            X.data = X.data.astype(cp.int32)
+            x_min = X.data.min()
+        else:
+            if X.dtype not in [cp.int32]:
+                warnings.warn("X dtype is not int32. X will be "
+                              "converted, which will increase memory "
+                              "consumption")
+                X = input_to_cupy_array(X, order='K',
+                                        convert_to_dtype=cp.int32).array
+            x_min = X.min()
+        if x_min < 0:
+            raise ValueError("Negative values in data passed to CategoricalNB")
+        return X
+
+    def fit(self, X, y, sample_weight=None) -> "CategoricalNB":
+        """Fit Naive Bayes classifier according to X, y
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training vectors, where n_samples is the number of samples and
+            n_features is the number of features. Here, each feature of X is
+            assumed to be from a different categorical distribution.
+            It is further assumed that all categories of each feature are
+            represented by the numbers 0, ..., n - 1, where n refers to the
+            total number of categories for the given feature. This can, for
+            instance, be achieved with the help of OrdinalEncoder.
+        y : array-like of shape (n_samples,)
+            Target values.
+        sample_weight : array-like of shape (n_samples), default=None
+            Weights applied to individual samples (1. for unweighted).
+            Currently sample weight is ignored.
+        Returns
+        -------
+        self : object
+        """
+        return super().fit(X, y, sample_weight=sample_weight)
+
+    def partial_fit(self, X, y, classes=None,
+                    sample_weight=None) -> "CategoricalNB":
+        """Incremental fit on a batch of samples.
+        This method is expected to be called several times consecutively
+        on different chunks of a dataset so as to implement out-of-core
+        or online learning.
+        This is especially useful when the whole dataset is too big to fit in
+        memory at once.
+        This method has some performance overhead hence it is better to call
+        partial_fit on chunks of data that are as large as possible
+        (as long as fitting in the memory budget) to hide the overhead.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training vectors, where n_samples is the number of samples and
+            n_features is the number of features. Here, each feature of X is
+            assumed to be from a different categorical distribution.
+            It is further assumed that all categories of each feature are
+            represented by the numbers 0, ..., n - 1, where n refers to the
+            total number of categories for the given feature. This can, for
+            instance, be achieved with the help of OrdinalEncoder.
+        y : array-like of shape (n_samples)
+            Target values.
+        classes : array-like of shape (n_classes), default=None
+            List of all the classes that can possibly appear in the y vector.
+            Must be provided at the first call to partial_fit, can be omitted
+            in subsequent calls.
+        sample_weight : array-like of shape (n_samples), default=None
+            Weights applied to individual samples (1. for unweighted).
+            Currently sample weight is ignored.
+        Returns
+        -------
+        self : object
+        """
+        return super().partial_fit(X, y, classes,
+                                   sample_weight=sample_weight)
+
+    def _count_sparse(self, x_coo_rows, x_coo_cols, x_coo_data, x_shape, Y,
+                      classes):
+        """
+        Sum feature counts & class prior counts and add to current model.
+        Parameters
+        ----------
+        x_coo_rows : cupy.ndarray of size (nnz)
+        x_coo_cols : cupy.ndarray of size (nnz)
+        x_coo_data : cupy.ndarray of size (nnz)
+        Y : cupy.array of monotonic class labels
+        """
+        n_classes = classes.shape[0]
+        n_rows = x_shape[0]
+        n_cols = x_shape[1]
+        x_coo_nnz = x_coo_rows.shape[0]
+        labels_dtype = classes.dtype
+        tpb = 256
+
+        if Y.dtype != classes.dtype:
+            warnings.warn("Y dtype does not match classes_ dtype. Y will be "
+                          "converted, which will increase memory consumption")
+
+        # Make sure Y is a cupy array, not CumlArray
+        Y = cp.asarray(Y)
+
+        class_c = cp.zeros(n_classes, dtype=self.class_count_.dtype)
+        count_classes = count_classes_kernel(self.class_count_.dtype,
+                                             labels_dtype)
+        count_classes((math.ceil(n_rows / tpb), ), (tpb, ),
+                      (class_c, n_rows, Y))
+
+        highest_feature = int(x_coo_data.max()) + 1
+        feature_diff = highest_feature - self.category_count_.shape[1]
+        # In case of a partial fit, pad the array to have the highest feature
+        if not cp.sparse.issparse(self.category_count_):
+            self.category_count_ = cupyx.scipy.sparse.coo_matrix(
+                (self.n_features_ * n_classes, highest_feature))
+        elif feature_diff > 0:
+            self.category_count_ = cupyx.scipy.sparse.coo_matrix(
+                self.category_count_,
+                shape=(self.n_features_ * n_classes, highest_feature))
+        highest_feature = self.category_count_.shape[1]
+
+        count_features_coo = cp.ElementwiseKernel(
+            'int32 row, int32 col, int32 val, int32 nnz, int32 n_classes, \
+             int32 n_cols, raw T labels',
+            'int32 out_row, int32 out_col',
+            '''
+            T label = labels[row];
+            out_row = col + n_cols * label;
+            out_col = val;
+            ''',
+            'count_features_categorical_coo_kernel')
+        counts_rows, counts_cols = count_features_coo(x_coo_rows,
+                                                      x_coo_cols,
+                                                      x_coo_data,
+                                                      x_coo_nnz,
+                                                      n_classes,
+                                                      n_cols,
+                                                      Y)
+        # Create the sparse category count matrix from the result of
+        # the raw kernel
+        counts = cupyx.scipy.sparse.coo_matrix(
+            (cp.ones(x_coo_nnz), (counts_rows, counts_cols)),
+            shape=(self.n_features_ * n_classes, highest_feature)).tocsr()
+
+        # Adjust with the missing (zeros) data of the sparse matrix
+        for i in range(n_classes):
+            counts[i*n_cols:(i+1)*n_cols, 0] = \
+                (Y == i).sum() - counts[i*n_cols:(i+1)*n_cols].sum(1)
+        self.category_count_ = (self.category_count_ + counts).tocoo()
+        self.class_count_ = self.class_count_ + class_c
+
+    def _count(self, X, Y, classes):
+        Y = cp.asarray(Y)
+        tpb = 32
+        n_rows = X.shape[0]
+        n_cols = X.shape[1]
+        n_classes = classes.shape[0]
+        labels_dtype = classes.dtype
+
+        sample_weight = cp.zeros(0, dtype=X.dtype)
+        highest_feature = int(X.max()) + 1
+        feature_diff = highest_feature - self.category_count_.shape[2]
+        # In case of a partial fit, pad the array to have the highest feature
+        if feature_diff > 0:
+            self.category_count_ = cp.pad(self.category_count_,
+                                          [(0, 0), (0, 0), (0, feature_diff)],
+                                          'constant')
+        highest_feature = self.category_count_.shape[2]
+        counts = cp.zeros((self.n_features_, n_classes, highest_feature),
+                          order="F", dtype=X.dtype)
+
+        count_features = count_features_dense_kernel(X.dtype,
+                                                     Y.dtype)
+        count_features((math.ceil(n_rows / tpb), math.ceil(n_cols / tpb), 1),
+                       (tpb, tpb, 1),
+                       (counts,
+                        X,
+                        n_rows,
+                        n_cols,
+                        Y,
+                        sample_weight,
+                        sample_weight.shape[0] > 0,
+                        self.n_classes_,
+                        False,
+                        X.flags["C_CONTIGUOUS"],
+                        True))
+        self.category_count_ += counts
+
+        class_c = cp.zeros(n_classes, order="F", dtype=self.class_count_.dtype)
+        count_classes = count_classes_kernel(class_c.dtype, labels_dtype)
+        count_classes((math.ceil(n_rows / tpb),), (tpb,),
+                      (class_c, n_rows, Y))
+        self.class_count_ += class_c
+
+    def _init_counters(self, n_effective_classes, n_features, dtype):
+        self.class_count_ = cp.zeros(n_effective_classes,
+                                     order="F",
+                                     dtype=cp.float64)
+        self.category_count_ = cp.zeros((n_features, n_effective_classes, 0),
+                                        order="F",
+                                        dtype=dtype)
+
+    def _update_feature_log_prob(self, alpha):
+        highest_feature = cp.zeros(self.n_features_, dtype=cp.float64)
+        if cp.sparse.issparse(self.category_count_):
+            # For sparse data we avoid the creation of the dense matrix
+            # feature_log_prob_. This can be created on the fly during
+            # the prediction without using as much memory.
+            features = self.category_count_.row % self.n_features_
+            cupyx.scatter_max(highest_feature,
+                              features,
+                              self.category_count_.col)
+            highest_feature = (highest_feature + 1) * alpha
+
+            smoothed_class_count = self.category_count_.sum(axis=1)
+            smoothed_class_count = smoothed_class_count.reshape(
+                (self.n_classes_, self.n_features_)).T
+            smoothed_class_count += highest_feature[:, cp.newaxis]
+            smoothed_cat_count = \
+                cupyx.scipy.sparse.coo_matrix(self.category_count_)
+            smoothed_cat_count.data = cp.log(smoothed_cat_count.data + alpha)
+            self.smoothed_cat_count = smoothed_cat_count.tocsr()
+            self.smoothed_class_count = cp.log(smoothed_class_count)
+        else:
+            indices = self.category_count_.nonzero()
+            cupyx.scatter_max(highest_feature, indices[0], indices[2])
+            highest_feature = (highest_feature + 1) * alpha
+
+            smoothed_class_count = self.category_count_.sum(axis=2) + \
+                highest_feature[:, cp.newaxis]
+            smoothed_cat_count = self.category_count_ + alpha
+            self.feature_log_prob_ = cp.log(smoothed_cat_count) - \
+                cp.log(smoothed_class_count[:, :, cp.newaxis])
+
+    def _joint_log_likelihood(self, X):
+        if not X.shape[1] == self.n_features_:
+            raise ValueError("Expected input with %d features, got %d instead"
+                             % (self.n_features_, X.shape[1]))
+        n_rows = X.shape[0]
+        if cp.sparse.isspmatrix(X):
+            # For sparse data we assume that most categories will be zeros,
+            # so we first compute the jll for categories 0
+            features_zeros = self.smoothed_cat_count[:, 0].todense()
+            features_zeros = features_zeros.reshape(self.n_classes_,
+                                                    self.n_features_).T
+            if self.alpha != 1.0:
+                features_zeros[cp.where(features_zeros == 0)] += \
+                    cp.log(self.alpha)
+            features_zeros -= self.smoothed_class_count
+            features_zeros = features_zeros.sum(0)
+            jll = cp.repeat(features_zeros[cp.newaxis, :], n_rows, axis=0)
+
+            X = X.tocoo()
+            col_indices = X.col
+
+            # Adjust with the non-zeros data by adding jll_data (non-zeros)
+            # and substracting jll_zeros which are the zeros
+            # that were first computed
+            for i in range(self.n_classes_):
+                jll_data = self.smoothed_cat_count[
+                    col_indices + i * self.n_features_, X.data].ravel()
+                jll_zeros = self.smoothed_cat_count[
+                    col_indices + i * self.n_features_, 0].todense()[:, 0]
+                if self.alpha != 1.0:
+                    jll_data[cp.where(jll_data == 0)] += cp.log(self.alpha)
+                    jll_zeros[cp.where(jll_zeros == 0)] += cp.log(self.alpha)
+                jll_data -= jll_zeros
+                cupyx.scatter_add(jll[:, i], X.row, jll_data)
+
+        else:
+            col_indices = cp.indices(X.shape)[1].flatten()
+            jll = self.feature_log_prob_[col_indices, :, X.ravel()]
+            jll = jll.reshape((n_rows, self.n_features_, self.n_classes_))
+            jll = jll.sum(1)
+        jll += self.class_log_prior_
+        return jll
+
+    def get_param_names(self):
+        return super().get_param_names() + \
+            [
+                "alpha",
+                "class_prior",
+                "fit_prior"
+            ]
diff --git a/python/cuml/neighbors/__init__.py b/python/cuml/neighbors/__init__.py
index 68a301bfc7..2cece8b2f7 100644
--- a/python/cuml/neighbors/__init__.py
+++ b/python/cuml/neighbors/__init__.py
@@ -33,6 +33,10 @@
         "inner_product", "sqeuclidean",
         "haversine"
     ]),
+    "rbc": set([
+        "euclidean", "haversine",
+        "l2"
+    ]),
     "ivfflat": set([
         "l2", "euclidean", "sqeuclidean",
         "inner_product", "cosine", "correlation"
@@ -45,7 +49,7 @@
         "l2", "euclidean", "sqeuclidean",
         "inner_product", "cosine", "correlation"
     ])
-    }
+}
 
 VALID_METRICS_SPARSE = {
     "brute": set(["euclidean", "l2", "inner_product",
diff --git a/python/cuml/neighbors/nearest_neighbors.pyx b/python/cuml/neighbors/nearest_neighbors.pyx
index f9fc1527d1..d63ff3ba00 100644
--- a/python/cuml/neighbors/nearest_neighbors.pyx
+++ b/python/cuml/neighbors/nearest_neighbors.pyx
@@ -48,7 +48,7 @@ from cython.operator cimport dereference as deref
 from libcpp cimport bool
 from libcpp.memory cimport shared_ptr
 
-from libc.stdint cimport uintptr_t, int64_t
+from libc.stdint cimport uintptr_t, int64_t, uint32_t
 from libc.stdlib cimport calloc, malloc, free
 
 from libcpp.vector cimport vector
@@ -63,6 +63,16 @@ cimport cuml.common.cuda
 if has_scipy():
     import scipy.sparse
 
+
+cdef extern from "raft/spatial/knn/ball_cover_common.h" \
+        namespace "raft::spatial::knn":
+    cdef cppclass BallCoverIndex[int64_t, float, uint32_t]:
+        BallCoverIndex(const handle_t &handle,
+                       float *X,
+                       uint32_t n_rows,
+                       uint32_t n_cols,
+                       DistanceType metric) except +
+
 cdef extern from "cuml/neighbors/knn.hpp" namespace "ML":
     void brute_force_knn(
         const handle_t &handle,
@@ -80,6 +90,21 @@ cdef extern from "cuml/neighbors/knn.hpp" namespace "ML":
         float metric_arg
     ) except +
 
+    void rbc_build_index(
+        const handle_t &handle,
+        BallCoverIndex[int64_t, float, uint32_t] &index,
+    ) except +
+
+    void rbc_knn_query(
+        const handle_t &handle,
+        BallCoverIndex[int64_t, float, uint32_t] &index,
+        uint32_t k,
+        float *search_items,
+        uint32_t n_search_items,
+        int64_t *out_inds,
+        float *out_dists
+    ) except +
+
     void approx_knn_build_index(
         handle_t &handle,
         knnIndex* index,
@@ -101,6 +126,7 @@ cdef extern from "cuml/neighbors/knn.hpp" namespace "ML":
         int n
     ) except +
 
+
 cdef extern from "cuml/neighbors/knn_sparse.hpp" namespace "ML::Sparse":
     void brute_force_knn(handle_t &handle,
                          const int *idxIndptr,
@@ -148,6 +174,12 @@ class NearestNeighbors(Base,
     algorithm : string (default='brute')
         The query algorithm to use. Valid options are:
 
+        - ``'auto'``: to automatically select brute-force or
+          random ball cover based on data shape and metric
+        - ``'rbc'``: for the random ball algorithm, which partitions
+          the data space and uses the triangle inequality to lower the
+          number of potential distances. Currently, this algorithm
+          supports 2d Euclidean and Haversine.
         - ``'brute'``: for brute-force, slow but produces exact results
         - ``'ivfflat'``: for inverted file, divide the dataset in partitions
           and perform search on relevant partitions only
@@ -281,6 +313,10 @@ class NearestNeighbors(Base,
     the FAISS release that this cuML version is linked to.
     (see cuML issue #4020)
 
+    Warning: For compatibility with libraries that rely on scikit-learn,
+    kwargs allows for passing of arguments that are not explicit in the
+    class constructor, such as 'n_jobs', but they have no effect on behavior.
+
     For an additional example see `the NearestNeighbors notebook
     <https://github.com/rapidsai/cuml/blob/branch-0.15/notebooks/nearest_neighbors_demo.ipynb>`_.
 
@@ -295,12 +331,13 @@ class NearestNeighbors(Base,
                  n_neighbors=5,
                  verbose=False,
                  handle=None,
-                 algorithm="brute",
+                 algorithm="auto",
                  metric="euclidean",
                  p=2,
                  algo_params=None,
                  metric_params=None,
-                 output_type=None):
+                 output_type=None,
+                 **kwargs):
 
         super().__init__(handle=handle,
                          verbose=verbose,
@@ -313,8 +350,10 @@ class NearestNeighbors(Base,
         self.algo_params = algo_params
         self.p = p
         self.algorithm = algorithm
+        self.working_algorithm_ = self.algorithm
+        self.selected_algorithm_ = algorithm
         self.algo_params = algo_params
-        self.knn_index = <uintptr_t> 0
+        self.knn_index = None
 
     @generate_docstring(X='dense_sparse')
     def fit(self, X, convert_dtype=True) -> "NearestNeighbors":
@@ -327,29 +366,44 @@ class NearestNeighbors(Base,
 
         self.n_dims = X.shape[1]
 
+        if self.algorithm == "auto":
+            if self.n_dims == 2 and self.metric in \
+                    cuml.neighbors.VALID_METRICS["rbc"]:
+                self.working_algorithm_ = "rbc"
+            else:
+                self.working_algorithm_ = "brute"
+
+        if self.algorithm == "rbc" and self.n_dims > 2:
+            raise ValueError("rbc algorithm currently only supports 2d data")
+
         if is_sparse(X):
             valid_metrics = cuml.neighbors.VALID_METRICS_SPARSE
+            value_metric_str = "_SPARSE"
             self.X_m = SparseCumlArray(X, convert_to_dtype=cp.float32,
                                        convert_format=False)
             self.n_rows = self.X_m.shape[0]
 
         else:
             valid_metrics = cuml.neighbors.VALID_METRICS
+            valid_metric_str = ""
             self.X_m, self.n_rows, n_cols, dtype = \
                 input_to_cuml_array(X, order='C', check_dtype=np.float32,
                                     convert_to_dtype=(np.float32
                                                       if convert_dtype
                                                       else None))
 
-        if self.metric not in valid_metrics[self.algorithm]:
+        if self.metric not in \
+                valid_metrics[self.working_algorithm_]:
             raise ValueError("Metric %s is not valid. "
-                             "Use sorted(cuml.neighbors.VALID_METRICS[%s]) "
+                             "Use sorted(cuml.neighbors.VALID_METRICS%s[%s]) "
                              "to get valid options." %
-                             (self.metric, self.algorithm))
+                             (valid_metric_str,
+                              self.metric,
+                              self.working_algorithm_))
 
         cdef handle_t* handle_ = <handle_t*><uintptr_t> self.handle.getHandle()
         cdef knnIndexParam* algo_params = <knnIndexParam*> 0
-        if self.algorithm in ['ivfflat', 'ivfpq', 'ivfsq']:
+        if self.working_algorithm_ in ['ivfflat', 'ivfpq', 'ivfsq']:
             warnings.warn("\nWarning: Approximate Nearest Neighbor methods "
                           "might be unstable in this version of cuML. "
                           "This is due to a known issue in the FAISS "
@@ -365,7 +419,7 @@ class NearestNeighbors(Base,
             knn_index = new knnIndex()
             self.knn_index = <uintptr_t> knn_index
             algo_params = <knnIndexParam*><uintptr_t> \
-                build_algo_params(self.algorithm, self.algo_params,
+                build_algo_params(self.working_algorithm_, self.algo_params,
                                   additional_info)
             metric = self._build_metric_type(self.metric)
 
@@ -382,6 +436,16 @@ class NearestNeighbors(Base,
             destroy_algo_params(<uintptr_t>algo_params)
 
             del self.X_m
+        elif self.working_algorithm_ == "rbc":
+            metric = self._build_metric_type(self.metric)
+
+            rbc_index = new BallCoverIndex[int64_t, float, uint32_t](
+                handle_[0], <float*><uintptr_t>self.X_m.ptr,
+                <uint32_t>self.n_rows, <uint32_t>n_cols,
+                <DistanceType>metric)
+            rbc_build_index(handle_[0],
+                            deref(rbc_index))
+            self.knn_index = <uintptr_t>rbc_index
 
         self.n_indices = 1
         return self
@@ -649,8 +713,10 @@ class NearestNeighbors(Base,
         cdef vector[float*] *inputs = new vector[float*]()
         cdef vector[int] *sizes = new vector[int]()
         cdef knnIndex* knn_index = <knnIndex*> 0
+        cdef BallCoverIndex[int64_t, float, uint32_t]* rbc_index = \
+            <BallCoverIndex[int64_t, float, uint32_t]*> 0
 
-        if self.algorithm == 'brute':
+        if self.working_algorithm_ == 'brute':
             inputs.push_back(<float*><uintptr_t>self.X_m.ptr)
             sizes.push_back(<int>self.X_m.shape[0])
 
@@ -670,6 +736,16 @@ class NearestNeighbors(Base,
                 # minkowski order is currently the only metric argument.
                 <float>self.p
             )
+        elif self.working_algorithm_ == "rbc":
+            rbc_index = <BallCoverIndex[int64_t, float, uint32_t]*>\
+                <uintptr_t>self.knn_index
+            rbc_knn_query(handle_[0],
+                          deref(rbc_index),
+                          <uint32_t> n_neighbors,
+                          <float*><uintptr_t>X_m.ptr,
+                          <uint32_t> N,
+                          <int64_t*>I_ptr,
+                          <float*>D_ptr)
         else:
             knn_index = <knnIndex*><uintptr_t> self.knn_index
             approx_knn_search(
@@ -821,9 +897,15 @@ class NearestNeighbors(Base,
         return sparse_csr
 
     def __del__(self):
-        cdef knnIndex* knn_index = <knnIndex*><uintptr_t>self.knn_index
-        if knn_index:
-            del knn_index
+        cdef knnIndex* knn_index = <knnIndex*>0
+        cdef BallCoverIndex* rbc_index = <BallCoverIndex*>0
+        if self.knn_index is not None:
+            if self.working_algorithm_ in ["ivfflat", "ivfpq", "ivfsq"]:
+                knn_index = <knnIndex*><uintptr_t>self.knn_index
+                del knn_index
+            else:
+                rbc_index = <BallCoverIndex*><uintptr_t>self.knn_index
+                del rbc_index
 
 
 @cuml.internals.api_return_sparse_array()
diff --git a/python/cuml/svm/svm_base.pyx b/python/cuml/svm/svm_base.pyx
index 077111d04e..e0a361a73d 100644
--- a/python/cuml/svm/svm_base.pyx
+++ b/python/cuml/svm/svm_base.pyx
@@ -326,9 +326,9 @@ class SVMBase(Base,
     @cuml.internals.api_base_return_array_skipall
     def coef_(self):
         if self._c_kernel != LINEAR:
-            raise RuntimeError("coef_ is only available for linear kernels")
+            raise AttributeError("coef_ is only available for linear kernels")
         if self._model is None:
-            raise RuntimeError("Call fit before prediction")
+            raise AttributeError("Call fit before prediction")
         if self._internal_coef_ is None:
             self._internal_coef_ = self._calc_coef()
         # Call the base class to perform the output conversion
diff --git a/python/cuml/test/dask/test_random_forest.py b/python/cuml/test/dask/test_random_forest.py
index 7e818df05e..3e4e25df51 100644
--- a/python/cuml/test/dask/test_random_forest.py
+++ b/python/cuml/test/dask/test_random_forest.py
@@ -331,36 +331,24 @@ def test_rf_concatenation_dask(client, model_type):
         assert local_tl.num_trees == n_estimators
 
 
-@pytest.mark.parametrize('model_type', ['classification', 'regression'])
 @pytest.mark.parametrize('ignore_empty_partitions', [True, False])
-def test_single_input(client, model_type, ignore_empty_partitions):
+def test_single_input_regression(client, ignore_empty_partitions):
     X, y = make_classification(n_samples=1, n_classes=1)
     X = X.astype(np.float32)
-    if model_type == 'classification':
-        y = y.astype(np.int32)
-    else:
-        y = y.astype(np.float32)
+    y = y.astype(np.float32)
 
     X, y = _prep_training_data(client, X, y,
                                partitions_per_worker=2)
-    if model_type == 'classification':
-        cu_rf_mg = cuRFC_mg(n_bins=1,
-                            ignore_empty_partitions=ignore_empty_partitions)
-    else:
-        cu_rf_mg = cuRFR_mg(n_bins=1,
-                            ignore_empty_partitions=ignore_empty_partitions)
+    cu_rf_mg = cuRFR_mg(n_bins=1,
+                        ignore_empty_partitions=ignore_empty_partitions)
 
     if ignore_empty_partitions or \
        len(client.scheduler_info()['workers'].keys()) == 1:
         cu_rf_mg.fit(X, y)
         cuml_mod_predict = cu_rf_mg.predict(X)
         cuml_mod_predict = cp.asnumpy(cp.array(cuml_mod_predict.compute()))
-
         y = cp.asnumpy(cp.array(y.compute()))
-
-        acc_score = accuracy_score(cuml_mod_predict, y)
-
-        assert acc_score == 1.0
+        assert y[0] == cuml_mod_predict[0]
 
     else:
         with pytest.raises(ValueError):
@@ -410,32 +398,30 @@ def test_rf_get_json(client, estimator_type, max_depth, n_estimators):
 
     # Test 3: Traverse JSON trees and get the same predictions as cuML RF
     def predict_with_json_tree(tree, x):
-        if 'children' not in tree:
-            assert 'leaf_value' in tree
-            return tree['leaf_value']
-        assert 'split_feature' in tree
-        assert 'split_threshold' in tree
-        assert 'yes' in tree
-        assert 'no' in tree
-        if x[tree['split_feature']] <= tree['split_threshold'] + 1e-5:
-            return predict_with_json_tree(tree['children'][0], x)
-        return predict_with_json_tree(tree['children'][1], x)
+        if "children" not in tree:
+            assert "leaf_value" in tree
+            return tree["leaf_value"]
+        assert "split_feature" in tree
+        assert "split_threshold" in tree
+        assert "yes" in tree
+        assert "no" in tree
+        if x[tree["split_feature"]] <= tree["split_threshold"] + 1e-5:
+            return predict_with_json_tree(tree["children"][0], x)
+        return predict_with_json_tree(tree["children"][1], x)
 
     def predict_with_json_rf_classifier(rf, x):
         # Returns the class with the highest vote. If there is a tie, return
         # the list of all classes with the highest vote.
-        vote = []
+        predictions = []
         for tree in rf:
-            vote.append(predict_with_json_tree(tree, x))
-        vote = np.bincount(vote)
-        max_vote = np.max(vote)
-        majority_vote = np.nonzero(np.equal(vote, max_vote))[0]
-        return majority_vote
+            predictions.append(np.array(predict_with_json_tree(tree, x)))
+        predictions = np.sum(predictions, axis=0)
+        return np.argmax(predictions)
 
     def predict_with_json_rf_regressor(rf, x):
-        pred = 0.
+        pred = 0.0
         for tree in rf:
-            pred += predict_with_json_tree(tree, x)
+            pred += predict_with_json_tree(tree, x)[0]
         return pred / len(rf)
 
     if estimator_type == 'classification':
@@ -443,7 +429,7 @@ def predict_with_json_rf_regressor(rf, x):
         expected_pred = expected_pred.compute().to_array()
         for idx, row in enumerate(X):
             majority_vote = predict_with_json_rf_classifier(json_obj, row)
-            assert expected_pred[idx] in majority_vote
+            assert expected_pred[idx] == majority_vote
     elif estimator_type == 'regression':
         expected_pred = cu_rf_mg.predict(X_dask).astype(np.float32)
         expected_pred = expected_pred.compute().to_array()
diff --git a/python/cuml/test/explainer/test_explainer_kernel_shap.py b/python/cuml/test/explainer/test_explainer_kernel_shap.py
index 219801f384..ec6e6cce18 100644
--- a/python/cuml/test/explainer/test_explainer_kernel_shap.py
+++ b/python/cuml/test/explainer/test_explainer_kernel_shap.py
@@ -21,9 +21,11 @@
 import pytest
 import sklearn.neighbors
 
+from cuml import Lasso
 from cuml import KernelExplainer
 from cuml.common.import_utils import has_scipy
 from cuml.common.import_utils import has_shap
+from cuml.datasets import make_regression
 from cuml.test.conftest import create_synthetic_dataset
 from cuml.test.utils import ClassEnumerator
 from cuml.test.utils import get_shap_values
@@ -322,6 +324,18 @@ def test_l1_regularization(exact_shap_regression_dataset, l1_type):
     assert isinstance(nz, cp.ndarray)
 
 
+def test_typeerror_input():
+    X, y = make_regression(n_samples=100, n_features=10, random_state=10)
+    clf = Lasso()
+    clf.fit(X, y)
+    exp = KernelExplainer(model=clf.predict, data=X, nsamples=10)
+    try:
+        _ = exp.shap_values(X)
+        assert True
+    except TypeError:
+        assert False
+
+
 ###############################################################################
 #                                 Precomputed results                         #
 #                               and testing variables                         #
diff --git a/python/cuml/test/test_arima.py b/python/cuml/test/test_arima.py
index a80c653efc..c6246cf87b 100644
--- a/python/cuml/test/test_arima.py
+++ b/python/cuml/test/test_arima.py
@@ -20,31 +20,30 @@
 #
 # This test file contains some unit tests and an integration test.
 #
-# The integration test has a wider tolerance margin, set separately for each
-# dataset. These margins have been found empirically when creating the
-# datasets. They will help to identify regressions.
+# The units tests use the same parameters with cuML and the reference
+# implementation to compare strict parity of specific components.
 #
-# The units tests use some ground truth (e.g the parameters found by the
-# reference implementation) to test a unique piece of code. The error margin
-# is then very small.
+# The integration tests compare that, when fitting and forecasting separately,
+# our implementation performs better or approximately as good as the reference
+# (it mostly serves to test that we don't have any regression)
 #
-# Note: when using an intercept, in certain cases our model and the reference
-# will converge to slightly different parameters. It is not an issue, but these
-# cases need to be removed for the tests
+# Note that there are significant differences between our implementation and
+# the reference, and perfect parity cannot be expected for integration tests.
 
 import pytest
 
-from collections import namedtuple
 import numpy as np
 import os
 import warnings
 
 import pandas as pd
 from scipy.optimize.optimize import approx_fprime
+from sklearn.model_selection import train_test_split
 import statsmodels.api as sm
 
 import cudf
 import cuml.tsa.arima as arima
+from cuml.common.input_utils import input_to_host_array
 
 from cuml.test.utils import stress_param
 
@@ -53,87 +52,98 @@
 #                                  Test data                                  #
 ###############################################################################
 
-# Common structure to hold the data, the reference and the testing parameters
-ARIMAData = namedtuple('ARIMAData', ['batch_size', 'n_obs', 'dataset', 'start',
-                                     'end', 'tolerance_integration'])
+class ARIMAData:
+    """Contains a dataset name and associated metadata
+    """
+    def __init__(self, batch_size, n_obs, n_test, dataset,
+                 tolerance_integration):
+        self.batch_size = batch_size
+        self.n_obs = n_obs
+        self.n_test = n_test
+        self.dataset = dataset
+        self.tolerance_integration = tolerance_integration
+
+        self.n_train = n_obs - n_test
+
 
 # ARIMA(1,0,1) with intercept
 test_101c = ARIMAData(
     batch_size=8,
     n_obs=15,
+    n_test=2,
     dataset="long_term_arrivals_by_citizenship",
-    start=10,
-    end=25,
-    tolerance_integration=0.06
+    tolerance_integration=0.01
 )
 
 # ARIMA(0,0,2) with intercept
 test_002c = ARIMAData(
     batch_size=7,
     n_obs=20,
+    n_test=2,
     dataset="net_migrations_auckland_by_age",
-    start=15,
-    end=30,
-    tolerance_integration=0.15
+    tolerance_integration=0.05
 )
 
 # ARIMA(0,1,0) with intercept
 test_010c = ARIMAData(
     batch_size=4,
     n_obs=17,
+    n_test=2,
     dataset="cattle",
-    start=10,
-    end=25,
-    tolerance_integration=0.001
+    tolerance_integration=0.01
 )
 
 # ARIMA(1,1,0)
 test_110 = ARIMAData(
     batch_size=1,
     n_obs=137,
+    n_test=5,
     dataset="police_recorded_crime",
-    start=100,
-    end=150,
-    tolerance_integration=0.001
+    tolerance_integration=0.01
 )
 
 # ARIMA(0,1,1) with intercept
 test_011c = ARIMAData(
     batch_size=16,
     n_obs=28,
+    n_test=2,
     dataset="deaths_by_region",
-    start=20,
-    end=40,
-    tolerance_integration=0.007
+    tolerance_integration=0.05
 )
 
 # ARIMA(1,2,1) with intercept
 test_121c = ARIMAData(
     batch_size=2,
     n_obs=137,
+    n_test=10,
     dataset="population_estimate",
-    start=100,
-    end=150,
-    tolerance_integration=0.05
+    tolerance_integration=0.01
+)
+
+# ARIMA(1,1,1) with intercept (missing observations)
+test_111c_missing = ARIMAData(
+    batch_size=2,
+    n_obs=137,
+    n_test=10,
+    dataset="population_estimate_missing",
+    tolerance_integration=0.01
 )
 
 # ARIMA(1,0,1)(1,1,1)_4
 test_101_111_4 = ARIMAData(
     batch_size=3,
     n_obs=101,
+    n_test=10,
     dataset="alcohol",
-    start=80,
-    end=110,
-    tolerance_integration=0.02
+    tolerance_integration=0.01
 )
 
 # ARIMA(5,1,0)
 test_510 = ARIMAData(
     batch_size=3,
     n_obs=101,
+    n_test=10,
     dataset="alcohol",
-    start=80,
-    end=110,
     tolerance_integration=0.02
 )
 
@@ -141,19 +151,26 @@
 test_111_200_4c = ARIMAData(
     batch_size=14,
     n_obs=123,
+    n_test=10,
     dataset="hourly_earnings_by_industry",
-    start=115,
-    end=130,
-    tolerance_integration=0.05
+    tolerance_integration=0.01
+)
+
+# ARIMA(1,1,1)(2,0,0)_4 with intercept (missing observations)
+test_111_200_4c_missing = ARIMAData(
+    batch_size=14,
+    n_obs=123,
+    n_test=10,
+    dataset="hourly_earnings_by_industry_missing",
+    tolerance_integration=0.01
 )
 
 # ARIMA(1,1,2)(0,1,2)_4
 test_112_012_4 = ARIMAData(
     batch_size=2,
     n_obs=179,
+    n_test=10,
     dataset="passenger_movements",
-    start=160,
-    end=200,
     tolerance_integration=0.001
 )
 
@@ -161,27 +178,38 @@
 test_111_111_12 = ARIMAData(
     batch_size=12,
     n_obs=279,
+    n_test=20,
     dataset="guest_nights_by_region",
-    start=260,
-    end=290,
     tolerance_integration=0.001
 )
 
+# ARIMA(1,1,1)(1,1,1)_12 (missing observations)
+test_111_111_12_missing = ARIMAData(
+    batch_size=12,
+    n_obs=279,
+    n_test=20,
+    dataset="guest_nights_by_region_missing",
+    tolerance_integration=0.03
+)
+
 # Dictionary matching a test case to a tuple of model parameters
 # (a test case could be used with different models)
 # (p, d, q, P, D, Q, s, k) -> ARIMAData
 test_data = [
-    # (1, 0, 1, 0, 0, 0, 0, 1): test_101c,
+    # ((1, 0, 1, 0, 0, 0, 0, 1), test_101c),
     ((0, 0, 2, 0, 0, 0, 0, 1), test_002c),
     ((0, 1, 0, 0, 0, 0, 0, 1), test_010c),
     ((1, 1, 0, 0, 0, 0, 0, 0), test_110),
     ((0, 1, 1, 0, 0, 0, 0, 1), test_011c),
     ((1, 2, 1, 0, 0, 0, 0, 1), test_121c),
+    ((1, 1, 1, 0, 0, 0, 0, 1), test_111c_missing),
     ((1, 0, 1, 1, 1, 1, 4, 0), test_101_111_4),
     ((5, 1, 0, 0, 0, 0, 0, 0), test_510),
     ((1, 1, 1, 2, 0, 0, 4, 1), test_111_200_4c),
+    ((1, 1, 1, 2, 0, 0, 4, 1), test_111_200_4c_missing),
     ((1, 1, 2, 0, 1, 2, 4, 0), test_112_012_4),
     stress_param((1, 1, 1, 1, 1, 1, 12, 0), test_111_111_12),
+    stress_param((1, 1, 1, 1, 1, 1, 12, 0), test_111_111_12_missing),
 ]
 
 # Dictionary for lazy-loading of datasets
@@ -211,8 +239,11 @@ def get_dataset(data, dtype):
         y = pd.read_csv(
             os.path.join(data_path, "{}.csv".format(data.dataset)),
             usecols=range(1, data.batch_size + 1), dtype=dtype)
-        y_cudf = cudf.from_pandas(y)
-        lazy_data[key] = (y, y_cudf)
+        y_train, y_test = train_test_split(y, test_size=data.n_test,
+                                           shuffle=False)
+        y_train_cudf = cudf.from_pandas(y_train).fillna(np.nan)
+        y_test_cudf = cudf.from_pandas(y_test)
+        lazy_data[key] = (y_train, y_train_cudf, y_test, y_test_cudf)
     return lazy_data[key]
 
 
@@ -220,20 +251,64 @@ def get_ref_fit(data, order, seasonal_order, intercept, dtype):
     """Compute a reference fit of a dataset with the given parameters and dtype
     or return a previously computed fit
     """
-    y, _ = get_dataset(data, dtype)
+    y_train, *_ = get_dataset(data, dtype)
     key = order + seasonal_order + \
         (intercept, data.dataset, np.dtype(dtype).name)
     if key not in lazy_ref_fit:
-        ref_model = [sm.tsa.SARIMAX(y[col], order=order,
+        ref_model = [sm.tsa.SARIMAX(y_train[col], order=order,
                                     seasonal_order=seasonal_order,
                                     trend='c' if intercept else 'n')
-                     for col in y.columns]
+                     for col in y_train.columns]
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore")
             lazy_ref_fit[key] = [model.fit(disp=0) for model in ref_model]
     return lazy_ref_fit[key]
 
 
+###############################################################################
+#                              Utility functions                              #
+###############################################################################
+
+def mase(y_train, y_test, y_fc, s):
+    y_train_np = input_to_host_array(y_train).array
+    y_test_np = input_to_host_array(y_test).array
+    y_fc_np = input_to_host_array(y_fc).array
+
+    diff = np.abs(y_train_np[s:] - y_train_np[:-s])
+    scale = np.nanmean(diff, axis=0)
+
+    error = np.abs(y_fc_np - y_test_np).mean(axis=0)
+    return np.mean(error / scale)
+
+
+def fill_interpolation(df_in):
+    np_arr = df_in.to_numpy()
+    for ib in range(np_arr.shape[1]):
+        n = len(np_arr)
+        start, end = -1, 0
+        while start < n - 1:
+            if not np.isnan(np_arr[start+1, ib]):
+                start += 1
+                end = start + 1
+            elif end < n and np.isnan(np_arr[end, ib]):
+                end += 1
+            else:
+                if start == -1:
+                    np_arr[:end, ib] = np_arr[end, ib]
+                elif end == n:
+                    np_arr[start+1:, ib] = np_arr[start, ib]
+                else:
+                    for j in range(start+1, end):
+                        coef = (j - start) / (end - start)
+                        np_arr[j, ib] = (
+                            (1. - coef) * np_arr[start, ib]
+                            + coef * np_arr[end, ib]
+                        )
+                start = end
+                end = start + 1
+    return pd.DataFrame(np_arr, columns=df_in.columns)
+
+
 ###############################################################################
 #                                    Tests                                    #
 ###############################################################################
@@ -241,17 +316,18 @@ def get_ref_fit(data, order, seasonal_order, intercept, dtype):
 @pytest.mark.parametrize('key, data', test_data)
 @pytest.mark.parametrize('dtype', [np.float64])
 def test_integration(key, data, dtype):
-    """Full integration test: estimate, fit, predict (in- and out-of-sample)
+    """Full integration test: estimate, fit, forecast
     """
     order, seasonal_order, intercept = extract_order(key)
+    s = max(1, seasonal_order[3])
 
-    y, y_cudf = get_dataset(data, dtype)
+    y_train, y_train_cudf, y_test, _ = get_dataset(data, dtype)
 
     # Get fit reference model
     ref_fits = get_ref_fit(data, order, seasonal_order, intercept, dtype)
 
     # Create and fit cuML model
-    cuml_model = arima.ARIMA(y_cudf,
+    cuml_model = arima.ARIMA(y_train_cudf,
                              order=order,
                              seasonal_order=seasonal_order,
                              fit_intercept=intercept,
@@ -259,16 +335,16 @@ def test_integration(key, data, dtype):
     cuml_model.fit()
 
     # Predict
-    cuml_pred = cuml_model.predict(data.start, data.end)
-    ref_preds = np.zeros((data.end - data.start, data.batch_size))
+    y_fc_cuml = cuml_model.forecast(data.n_test)
+    y_fc_ref = np.zeros((data.n_test, data.batch_size))
     for i in range(data.batch_size):
-        ref_preds[:, i] = ref_fits[i].get_prediction(
-            data.start, data.end - 1).predicted_mean
+        y_fc_ref[:, i] = ref_fits[i].get_prediction(
+            data.n_train, data.n_obs - 1).predicted_mean
 
-    # Compare results
-    np.testing.assert_allclose(cuml_pred, ref_preds,
-                               rtol=data.tolerance_integration,
-                               atol=data.tolerance_integration)
+    # Compare results: MASE must be better or within the tolerance margin
+    mase_ref = mase(y_train, y_test, y_fc_ref, s)
+    mase_cuml = mase(y_train, y_test, y_fc_cuml, s)
+    assert mase_cuml < mase_ref * (1. + data.tolerance_integration)
 
 
 def _statsmodels_to_cuml(ref_fits, cuml_model, order, seasonal_order,
@@ -297,13 +373,13 @@ def _predict_common(key, data, dtype, start, end, num_steps=None, level=None,
     """
     order, seasonal_order, intercept = extract_order(key)
 
-    y, y_cudf = get_dataset(data, dtype)
+    _, y_train_cudf, *_ = get_dataset(data, dtype)
 
     # Get fit reference model
     ref_fits = get_ref_fit(data, order, seasonal_order, intercept, dtype)
 
     # Create cuML model
-    cuml_model = arima.ARIMA(y_cudf,
+    cuml_model = arima.ARIMA(y_train_cudf,
                              order=order,
                              seasonal_order=seasonal_order,
                              fit_intercept=intercept,
@@ -349,38 +425,44 @@ def _predict_common(key, data, dtype, start, end, num_steps=None, level=None,
 @pytest.mark.parametrize('key, data', test_data)
 @pytest.mark.parametrize('dtype', [np.float64])
 @pytest.mark.parametrize('simple_differencing', [True, False])
-def test_predict(key, data, dtype, simple_differencing):
+def test_predict_in(key, data, dtype, simple_differencing):
     """Test in-sample prediction against statsmodels (with the same values
     for the model parameters)
     """
-    n_obs = data.n_obs
-    _predict_common(key, data, dtype, n_obs // 2, n_obs,
+    _predict_common(key, data, dtype, data.n_train // 2, data.n_obs,
                     simple_differencing=simple_differencing)
 
 
 @pytest.mark.parametrize('key, data', test_data)
 @pytest.mark.parametrize('dtype', [np.float64])
-@pytest.mark.parametrize('num_steps', [10])
 @pytest.mark.parametrize('simple_differencing', [True, False])
-def test_forecast(key, data, dtype, num_steps, simple_differencing):
+def test_predict_inout(key, data, dtype, simple_differencing):
+    """Test in- and ouf-of-sample prediction against statsmodels (with the
+    same values for the model parameters)
+    """
+    _predict_common(key, data, dtype, data.n_train // 2, data.n_train,
+                    simple_differencing=simple_differencing)
+
+
+@pytest.mark.parametrize('key, data', test_data)
+@pytest.mark.parametrize('dtype', [np.float64])
+@pytest.mark.parametrize('simple_differencing', [True, False])
+def test_forecast(key, data, dtype, simple_differencing):
     """Test out-of-sample forecasting against statsmodels (with the same
     values for the model parameters)
     """
-    n_obs = data.n_obs
-    _predict_common(key, data, dtype, n_obs, n_obs + num_steps, num_steps,
+    _predict_common(key, data, dtype, data.n_train, data.n_obs, data.n_test,
                     simple_differencing=simple_differencing)
 
 
 @pytest.mark.parametrize('key, data', test_data)
 @pytest.mark.parametrize('dtype', [np.float64])
-@pytest.mark.parametrize('num_steps', [10])
 @pytest.mark.parametrize('level', [0.5, 0.95])
-def test_intervals(key, data, dtype, num_steps, level):
+def test_intervals(key, data, dtype, level):
     """Test forecast confidence intervals against statsmodels (with the same
     values for the model parameters)
     """
-    n_obs = data.n_obs
-    _predict_common(key, data, dtype, n_obs, n_obs + num_steps, num_steps,
+    _predict_common(key, data, dtype, data.n_train, data.n_obs, data.n_test,
                     level)
 
 
@@ -393,13 +475,13 @@ def test_loglikelihood(key, data, dtype, simple_differencing):
     """
     order, seasonal_order, intercept = extract_order(key)
 
-    y, y_cudf = get_dataset(data, dtype)
+    _, y_train_cudf, *_ = get_dataset(data, dtype)
 
     # Get fit reference model
     ref_fits = get_ref_fit(data, order, seasonal_order, intercept, dtype)
 
     # Create cuML model
-    cuml_model = arima.ARIMA(y_cudf,
+    cuml_model = arima.ARIMA(y_train_cudf,
                              order=order,
                              seasonal_order=seasonal_order,
                              fit_intercept=intercept,
@@ -432,10 +514,10 @@ def test_gradient(key, data, dtype):
     N = p + P + q + Q + intercept + 1
     h = 1e-8
 
-    _, y_cudf = get_dataset(data, dtype)
+    _, y_train_cudf, *_ = get_dataset(data, dtype)
 
     # Create cuML model
-    cuml_model = arima.ARIMA(y_cudf,
+    cuml_model = arima.ARIMA(y_train_cudf,
                              order=order,
                              seasonal_order=seasonal_order,
                              fit_intercept=intercept)
@@ -451,7 +533,7 @@ def test_gradient(key, data, dtype):
     scipy_grad = np.zeros(N * data.batch_size)
     for i in range(data.batch_size):
         # Create a model with only the current series
-        model_i = arima.ARIMA(y_cudf[y_cudf.columns[i]],
+        model_i = arima.ARIMA(y_train_cudf[y_train_cudf.columns[i]],
                               order=order,
                               seasonal_order=seasonal_order,
                               fit_intercept=intercept)
@@ -473,17 +555,20 @@ def test_start_params(key, data, dtype):
     """
     order, seasonal_order, intercept = extract_order(key)
 
-    y, y_cudf = get_dataset(data, dtype)
+    y_train, y_train_cudf, *_ = get_dataset(data, dtype)
+
+    # fillna for reference to match cuML initial estimation strategy
+    y_train_nona = fill_interpolation(y_train)
 
     # Create models
-    cuml_model = arima.ARIMA(y_cudf,
+    cuml_model = arima.ARIMA(y_train_cudf,
                              order=order,
                              seasonal_order=seasonal_order,
                              fit_intercept=intercept)
-    ref_model = [sm.tsa.SARIMAX(y[col], order=order,
+    ref_model = [sm.tsa.SARIMAX(y_train_nona[col], order=order,
                                 seasonal_order=seasonal_order,
                                 trend='c' if intercept else 'n')
-                 for col in y.columns]
+                 for col in y_train_nona.columns]
 
     # Estimate reference starting parameters
     N = cuml_model.complexity
diff --git a/python/cuml/test/test_benchmark.py b/python/cuml/test/test_benchmark.py
index 1a5b57f902..3e1fbe1d85 100644
--- a/python/cuml/test/test_benchmark.py
+++ b/python/cuml/test/test_benchmark.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -170,8 +170,9 @@ def predict(self, X):
 
 # Only test a few algorithms (which collectively span several types)
 # to reduce runtime burden
-@pytest.mark.parametrize('algo_name', ['UMAP-Supervised',
-                                       'DBSCAN',
+# skipping UMAP-Supervised due to issue
+# https://github.com/rapidsai/cuml/issues/4243
+@pytest.mark.parametrize('algo_name', ['DBSCAN',
                                        'LogisticRegression',
                                        'ElasticNet',
                                        'FIL'])
@@ -183,7 +184,7 @@ def test_real_algos_runner(algo_name):
         pytest.xfail()
 
     runner = AccuracyComparisonRunner(
-        [20], [5], dataset_name='classification', test_fraction=0.20
+        [50], [5], dataset_name='classification', test_fraction=0.20
     )
     results = runner.run(pair)[0]
     print(results)
diff --git a/python/cuml/test/test_dbscan.py b/python/cuml/test/test_dbscan.py
index 3f0a8d152c..37708a0fcd 100644
--- a/python/cuml/test/test_dbscan.py
+++ b/python/cuml/test/test_dbscan.py
@@ -14,6 +14,7 @@
 #
 
 import numpy as np
+from numpy.testing import assert_raises
 import pytest
 
 from cuml.test.utils import get_handle
@@ -366,3 +367,11 @@ def test_dbscan_no_calc_core_point_indices():
 
     # Make sure we are None
     assert(cuml_dbscan.core_sample_indices_ is None)
+
+
+def test_dbscan_on_empty_array():
+
+    X = np.array([])
+    cuml_dbscan = cuDBSCAN()
+
+    assert_raises(ValueError, cuml_dbscan.fit, X)
diff --git a/python/cuml/test/test_fil.py b/python/cuml/test/test_fil.py
index a72d6b47b2..be460e7b68 100644
--- a/python/cuml/test/test_fil.py
+++ b/python/cuml/test/test_fil.py
@@ -16,6 +16,8 @@
 import numpy as np
 import pytest
 import os
+import pandas as pd
+from random import sample, seed
 
 from cuml import ForestInference
 from cuml.test.utils import array_equal, unit_param, \
@@ -35,18 +37,21 @@
     import xgboost as xgb
 
 
-def simulate_data(m, n, k=2, random_state=None, classification=True,
-                  bias=0.0):
+def simulate_data(m, n, k=2, n_informative='auto', random_state=None,
+                  classification=True, bias=0.0):
+    if n_informative == 'auto':
+        n_informative = n // 5
     if classification:
         features, labels = make_classification(n_samples=m,
                                                n_features=n,
-                                               n_informative=int(n/5),
+                                               n_informative=n_informative,
+                                               n_redundant=n - n_informative,
                                                n_classes=k,
                                                random_state=random_state)
     else:
         features, labels = make_regression(n_samples=m,
                                            n_features=n,
-                                           n_informative=int(n/5),
+                                           n_informative=n_informative,
                                            n_targets=1,
                                            bias=bias,
                                            random_state=random_state)
@@ -486,16 +491,59 @@ def test_output_args(small_classifier_and_preds):
     assert array_equal(fil_preds, xgb_preds, 1e-3)
 
 
+def to_categorical(features, n_categorical):
+    # the main bottleneck (>80%) of to_categorical() is the pandas operations
+    n_features = features.shape[1]
+    df_cols = {}
+    # all categorical columns
+    cat_cols = features[:, :n_categorical]
+    cat_cols = cat_cols - cat_cols.min(axis=1, keepdims=True)  # range [0, ?]
+    cat_cols /= cat_cols.max(axis=1, keepdims=True)  # range [0, 1]
+    rough_n_categories = 100
+    # round into rough_n_categories bins
+    cat_cols = (cat_cols * rough_n_categories).astype(int)
+    for icol in range(n_categorical):
+        col = cat_cols[:, icol]
+        df_cols[icol] = pd.Series(pd.Categorical(col,
+                                                 categories=np.unique(col)))
+    # all numerical columns
+    for icol in range(n_categorical, n_features):
+        df_cols[icol] = pd.Series(features[:, icol])
+    # shuffle the columns around
+    seed(42)
+    new_idx = sample(range(n_features), k=n_features)
+    df_cols = {i: df_cols[new_idx[i]] for i in range(n_features)}
+
+    return pd.DataFrame(df_cols)
+
+
 @pytest.mark.parametrize('num_classes', [2, 5])
+@pytest.mark.parametrize('n_categorical', [0, 5])
 @pytest.mark.skipif(has_lightgbm() is False, reason="need to install lightgbm")
-def test_lightgbm(tmp_path, num_classes):
+def test_lightgbm(tmp_path, num_classes, n_categorical):
     import lightgbm as lgb
-    X, y = simulate_data(500,
-                         10 if num_classes == 2 else 50,
+
+    if n_categorical > 0:
+        n_features = 10
+        n_rows = 1000
+        n_informative = n_features
+    else:
+        n_features = 10 if num_classes == 2 else 50
+        n_rows = 500
+        n_informative = 'auto'
+
+    X, y = simulate_data(n_rows,
+                         n_features,
                          num_classes,
+                         n_informative=n_informative,
                          random_state=43210,
                          classification=True)
-    train_data = lgb.Dataset(X, label=y)
+    if n_categorical > 0:
+        X_fit = to_categorical(X, n_categorical)
+    else:
+        X_fit = X
+
+    train_data = lgb.Dataset(X_fit, label=y)
     num_round = 5
     model_path = str(os.path.join(tmp_path, 'lgb.model'))
 
@@ -522,7 +570,7 @@ def test_lightgbm(tmp_path, num_classes):
         lgm = lgb.LGBMClassifier(objective='multiclass',
                                  boosting_type='gbdt',
                                  n_estimators=num_round)
-        lgm.fit(X, y)
+        lgm.fit(X_fit, y)
         lgm.booster_.save_model(model_path)
         fm = ForestInference.load(model_path,
                                   algo='TREE_REORG',
diff --git a/python/cuml/test/test_metrics.py b/python/cuml/test/test_metrics.py
index 04ca6e97ff..9230e9b018 100644
--- a/python/cuml/test/test_metrics.py
+++ b/python/cuml/test/test_metrics.py
@@ -79,6 +79,8 @@
 from cuml.metrics import pairwise_distances, sparse_pairwise_distances, \
     PAIRWISE_DISTANCE_METRICS, PAIRWISE_DISTANCE_SPARSE_METRICS
 from sklearn.metrics import pairwise_distances as sklearn_pairwise_distances
+from scipy.spatial import distance as scipy_pairwise_distances
+from scipy.special import rel_entr as scipy_kl_divergence
 
 
 @pytest.fixture(scope='module')
@@ -867,19 +869,29 @@ def test_log_loss_at_limits():
         log_loss(y_true, y_pred)
 
 
-def ref_dense_pairwise_dist(X, Y=None, metric=None):
+def naive_kl_divergence_dist(X, Y):
+    return 0.5 * np.array([[np.sum(np.where(yj != 0,
+                          scipy_kl_divergence(xi, yj), 0.0)) for yj in Y]
+                          for xi in X])
+
+
+def ref_dense_pairwise_dist(X, Y=None, metric=None, convert_dtype=False):
     # Select sklearn except for Hellinger that
     # sklearn doesn't support
     if Y is None:
         Y = X
     if metric == "hellinger":
         return naive_hellinger(X, Y)
+    elif metric == "jensenshannon":
+        return scipy_pairwise_distances.cdist(X, Y, 'jensenshannon')
+    elif metric == "kldivergence":
+        return naive_kl_divergence_dist(X, Y)
     else:
         return sklearn_pairwise_distances(X, Y, metric)
 
 
 def prep_dense_array(array, metric, col_major=0):
-    if metric == "hellinger":
+    if metric in ['hellinger', 'jensenshannon', 'kldivergence']:
         norm_array = preprocessing.normalize(array, norm="l1")
         return np.asfortranarray(norm_array) if col_major else norm_array
     else:
@@ -940,11 +952,12 @@ def test_pairwise_distances(metric: str, matrix_size, is_col_major):
     cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)
 
     # Test sending an int type with convert_dtype=True
-    Y = prep_dense_array(rng.randint(10, size=Y.shape),
-                         metric=metric, col_major=is_col_major)
-    S = pairwise_distances(X, Y, metric=metric, convert_dtype=True)
-    S2 = ref_dense_pairwise_dist(X, Y, metric=metric)
-    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)
+    if metric != 'kldivergence':
+        Y = prep_dense_array(rng.randint(10, size=Y.shape),
+                             metric=metric, col_major=is_col_major)
+        S = pairwise_distances(X, Y, metric=metric, convert_dtype=True)
+        S2 = ref_dense_pairwise_dist(X, Y, metric=metric, convert_dtype=True)
+        cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)
 
     # Test that uppercase on the metric name throws an error.
     with pytest.raises(ValueError):
@@ -1442,3 +1455,11 @@ def test_kl_divergence(nfeatures, input_type, dtypeP, dtypeQ):
         cu_res = cu_kl_divergence(P, Q, convert_dtype=False)
 
     cp.testing.assert_array_almost_equal(cu_res, sk_res)
+
+
+def test_mean_squared_error():
+    y1 = np.array([[1], [2], [3]])
+    y2 = y1.squeeze()
+
+    assert mean_squared_error(y1, y2) == 0
+    assert mean_squared_error(y2, y1) == 0
diff --git a/python/cuml/test/test_naive_bayes.py b/python/cuml/test/test_naive_bayes.py
index 1abd046656..0f704f8ad2 100644
--- a/python/cuml/test/test_naive_bayes.py
+++ b/python/cuml/test/test_naive_bayes.py
@@ -21,6 +21,7 @@
 from sklearn.metrics import accuracy_score
 from cuml.naive_bayes import MultinomialNB
 from cuml.naive_bayes import BernoulliNB
+from cuml.naive_bayes import CategoricalNB
 from cuml.naive_bayes import GaussianNB
 from cuml.common.input_utils import sparse_scipy_to_cp
 
@@ -28,6 +29,7 @@
 from numpy.testing import assert_array_almost_equal, assert_raises
 from sklearn.naive_bayes import MultinomialNB as skNB
 from sklearn.naive_bayes import BernoulliNB as skBNB
+from sklearn.naive_bayes import CategoricalNB as skCNB
 from sklearn.naive_bayes import GaussianNB as skGNB
 
 import math
@@ -280,13 +282,14 @@ def test_bernoulli(x_dtype, y_dtype, is_sparse, nlp_20news):
                                      cp.float32, cp.float64])
 def test_bernoulli_partial_fit(x_dtype, y_dtype, nlp_20news):
     chunk_size = 500
+    n_rows = 1500
 
     X, y = nlp_20news
 
     X = sparse_scipy_to_cp(X, x_dtype).astype(x_dtype)
-    y = y.astype(y_dtype)
+    y = y.astype(y_dtype)[:n_rows]
 
-    X = X.tocsr()
+    X = X.tocsr()[:n_rows]
 
     model = BernoulliNB()
     modelsk = skBNB()
@@ -356,10 +359,10 @@ def test_gaussian_fit_predict(x_dtype, y_dtype, is_sparse,
 
     X, y = nlp_20news
     model = GaussianNB()
-    n_rows = 1000
-
+    n_rows = 500
+    n_cols = int(2e5)
     X = sparse_scipy_to_cp(X, x_dtype)
-    X = X.tocsr()[:n_rows]
+    X = X.tocsr()[:n_rows, :n_cols]
 
     if is_sparse:
         y = y.astype(y_dtype)[:n_rows]
@@ -378,8 +381,8 @@ def test_gaussian_fit_predict(x_dtype, y_dtype, is_sparse,
 
 @pytest.mark.xfail(reason="This test requires an update (see #4180)")
 def test_gaussian_partial_fit(nlp_20news):
-    chunk_size = 200
-    n_rows = 1000
+    chunk_size = 250
+    n_rows = 1500
     x_dtype, y_dtype = cp.float32, cp.int32
 
     X, y = nlp_20news
@@ -388,7 +391,6 @@ def test_gaussian_partial_fit(nlp_20news):
     y = y.astype(y_dtype)[:n_rows]
 
     model = GaussianNB()
-    modelsk = skGNB()
 
     classes = np.unique(y)
 
@@ -407,23 +409,17 @@ def test_gaussian_partial_fit(nlp_20news):
             x = X[i*chunk_size:]
             y_c = y[i*chunk_size:]
 
-        modelsk.partial_fit(x.get().toarray(),
-                            y_c.get(),
-                            classes=classes.get())
         model.partial_fit(x, y_c, classes=classes)
 
         total_fit += (upper - (i*chunk_size))
-
         if upper == -1:
             break
 
     y_hat = model.predict(X)
-    y_sk = modelsk.predict(X.get().toarray())
 
     y_hat = cp.asnumpy(y_hat)
     y = cp.asnumpy(y)
-    assert_array_equal(y_hat, y_sk)
-    assert accuracy_score(y, y_hat) >= 0.924
+    assert accuracy_score(y, y_hat) >= 0.99
 
     # Test whether label mismatch between target y and classes raises an Error
     assert_raises(ValueError,
@@ -462,3 +458,129 @@ def test_gaussian_parameters(priors, var_smoothing, nlp_20news):
 
     assert_allclose(model.epsilon_.get(), model_sk.epsilon_, rtol=1e-4)
     assert_array_equal(y_hat, y_hat_sk)
+
+
+@pytest.mark.parametrize("x_dtype", [cp.int32, cp.float32, cp.float64])
+@pytest.mark.parametrize("y_dtype", [cp.int32, cp.int64])
+@pytest.mark.parametrize("is_sparse", [True, False])
+def test_categorical(x_dtype, y_dtype, is_sparse, nlp_20news):
+    if x_dtype == cp.int32 and is_sparse:
+        pytest.skip("Sparse matrices with integers dtype are not supported")
+    X, y = nlp_20news
+    n_rows = 2000
+    n_cols = 500
+
+    X = sparse_scipy_to_cp(X, dtype=cp.float32)
+    X = X.tocsr()[:n_rows, :n_cols]
+    y = y.astype(y_dtype)[:n_rows]
+
+    if not is_sparse:
+        X = X.todense()
+    X = X.astype(x_dtype)
+    cuml_model = CategoricalNB()
+    cuml_model.fit(X, y)
+    cuml_score = cuml_model.score(X, y)
+    cuml_proba = cuml_model.predict_log_proba(X).get()
+
+    X = X.todense().get() if is_sparse else X.get()
+    y = y.get()
+    sk_model = skCNB()
+    sk_model.fit(X, y)
+    sk_score = sk_model.score(X, y)
+    sk_proba = sk_model.predict_log_proba(X)
+
+    THRES = 1e-3
+
+    assert_array_equal(sk_model.class_count_, cuml_model.class_count_.get())
+    assert_allclose(sk_model.class_log_prior_,
+                    cuml_model.class_log_prior_.get(), 1e-6)
+    assert_allclose(cuml_proba, sk_proba, atol=1e-2, rtol=1e-2)
+    assert sk_score - THRES <= cuml_score <= sk_score + THRES
+
+
+@pytest.mark.parametrize("x_dtype", [cp.int32,
+                                     cp.float32, cp.float64])
+@pytest.mark.parametrize("y_dtype", [cp.int32, cp.int64])
+@pytest.mark.parametrize("is_sparse", [True, False])
+def test_categorical_partial_fit(x_dtype, y_dtype, is_sparse, nlp_20news):
+    if x_dtype == cp.int32 and is_sparse:
+        pytest.skip("Sparse matrices with integers dtype are not supported")
+    n_rows = 5000
+    n_cols = 500
+    chunk_size = 1000
+
+    X, y = nlp_20news
+
+    X = sparse_scipy_to_cp(X, 'float32').tocsr()[:n_rows]
+    if is_sparse:
+        X.data = X.data.astype(x_dtype)
+        expected_score = 0.5414
+    else:
+        X = X[:, :n_cols].todense().astype(x_dtype)
+        expected_score = 0.1040
+    y = y.astype(y_dtype)[:n_rows]
+
+    model = CategoricalNB()
+
+    classes = np.unique(y)
+    for i in range(math.ceil(X.shape[0] / chunk_size)):
+
+        upper = i*chunk_size+chunk_size
+        if upper > X.shape[0]:
+            upper = -1
+
+        if upper > 0:
+            x = X[i*chunk_size:upper]
+            y_c = y[i*chunk_size:upper]
+        else:
+            x = X[i*chunk_size:]
+            y_c = y[i*chunk_size:]
+        model.partial_fit(x, y_c, classes=classes)
+        if upper == -1:
+            break
+
+    cuml_score = model.score(X, y)
+    THRES = 1e-4
+    assert expected_score - THRES <= cuml_score <= expected_score + THRES
+
+
+@pytest.mark.parametrize("class_prior", [None, 'balanced', 'unbalanced'])
+@pytest.mark.parametrize("alpha", [0.1, 0.5, 1.5])
+@pytest.mark.parametrize("fit_prior", [False, True])
+@pytest.mark.parametrize("is_sparse", [False, True])
+def test_categorical_parameters(class_prior, alpha, fit_prior,
+                                is_sparse, nlp_20news):
+    x_dtype = cp.float32
+    y_dtype = cp.int32
+    nrows = 2000
+    ncols = 500
+
+    X, y = nlp_20news
+
+    X = sparse_scipy_to_cp(X, x_dtype).tocsr()[:nrows, :ncols]
+    if not is_sparse:
+        X = X.todense()
+    y = y.astype(y_dtype)[:nrows]
+
+    if class_prior == 'balanced':
+        class_prior = np.array([1/20] * 20)
+    elif class_prior == 'unbalanced':
+        class_prior = np.linspace(0.01, 0.09, 20)
+
+    model = CategoricalNB(class_prior=class_prior,
+                          alpha=alpha,
+                          fit_prior=fit_prior)
+    model_sk = skCNB(class_prior=class_prior,
+                     alpha=alpha,
+                     fit_prior=fit_prior)
+    model.fit(X, y)
+    y_hat = model.predict(X).get()
+    y_log_prob = model.predict_log_proba(X).get()
+
+    X = X.todense().get() if is_sparse else X.get()
+    model_sk.fit(X, y.get())
+    y_hat_sk = model_sk.predict(X)
+    y_log_prob_sk = model_sk.predict_log_proba(X)
+
+    assert_allclose(y_log_prob, y_log_prob_sk, rtol=1e-4)
+    assert_array_equal(y_hat, y_hat_sk)
diff --git a/python/cuml/test/test_nearest_neighbors.py b/python/cuml/test/test_nearest_neighbors.py
index 534f2e0d64..b5c4047a69 100644
--- a/python/cuml/test/test_nearest_neighbors.py
+++ b/python/cuml/test/test_nearest_neighbors.py
@@ -25,6 +25,7 @@
 from cuml.datasets import make_blobs
 
 from sklearn.metrics import pairwise_distances
+from cuml.metrics import pairwise_distances as cuPW
 
 from cuml.common import logger
 
@@ -219,11 +220,11 @@ def test_ivfflat_pred(nrows, ncols, n_neighbors, nlist):
 
 
 @pytest.mark.parametrize("nlist", [8])
-@pytest.mark.parametrize("M", [16, 32])
-@pytest.mark.parametrize("n_bits", [4, 6])
+@pytest.mark.parametrize("M", [32])
+@pytest.mark.parametrize("n_bits", [4])
 @pytest.mark.parametrize("usePrecomputedTables", [False, True])
 @pytest.mark.parametrize("nrows", [4000])
-@pytest.mark.parametrize("ncols", [128, 512])
+@pytest.mark.parametrize("ncols", [64, 512])
 @pytest.mark.parametrize("n_neighbors", [8])
 def test_ivfpq_pred(nrows, ncols, n_neighbors,
                     nlist, M, n_bits, usePrecomputedTables):
@@ -327,7 +328,8 @@ def test_return_dists():
 @pytest.mark.parametrize('nrows', [unit_param(500), quality_param(5000),
                          stress_param(70000)])
 @pytest.mark.parametrize('n_feats', [unit_param(3), stress_param(1000)])
-@pytest.mark.parametrize('k', [unit_param(3), stress_param(50)])
+@pytest.mark.parametrize('k', [unit_param(3),
+                               stress_param(50)])
 @pytest.mark.parametrize("metric", valid_metrics())
 def test_knn_separate_index_search(input_type, nrows, n_feats, k, metric):
     X, _ = make_blobs(n_samples=nrows,
@@ -381,8 +383,10 @@ def test_knn_separate_index_search(input_type, nrows, n_feats, k, metric):
 
 @pytest.mark.parametrize('input_type', ['dataframe', 'ndarray'])
 @pytest.mark.parametrize('nrows', [unit_param(500), stress_param(70000)])
-@pytest.mark.parametrize('n_feats', [unit_param(3), stress_param(1000)])
-@pytest.mark.parametrize('k', [unit_param(3), stress_param(50)])
+@pytest.mark.parametrize('n_feats', [unit_param(3),
+                                     stress_param(1000)])
+@pytest.mark.parametrize('k', [unit_param(3), unit_param(35),
+                               stress_param(50)])
 @pytest.mark.parametrize("metric", valid_metrics())
 def test_knn_x_none(input_type, nrows, n_feats, k, metric):
     X, _ = make_blobs(n_samples=nrows,
@@ -468,10 +472,11 @@ def test_nn_downcast_fails(input_type, nrows, n_feats):
     ("ndarray", "connectivity", "cupy", False),
     ("ndarray", "distance", "numpy", False),
     ])
-@pytest.mark.parametrize('nrows', [unit_param(10), stress_param(1000)])
+@pytest.mark.parametrize('nrows', [unit_param(100), stress_param(1000)])
 @pytest.mark.parametrize('n_feats', [unit_param(5), stress_param(100)])
 @pytest.mark.parametrize("p", [2, 5])
-@pytest.mark.parametrize('k', [unit_param(3), stress_param(30)])
+@pytest.mark.parametrize('k', [unit_param(3), unit_param(35),
+                               stress_param(30)])
 @pytest.mark.parametrize("metric", valid_metrics())
 def test_knn_graph(input_type, mode, output_type, as_instance,
                    nrows, n_feats, p, k, metric):
@@ -511,6 +516,47 @@ def test_knn_graph(input_type, mode, output_type, as_instance,
         assert isspmatrix_csr(sparse_cu)
 
 
+@pytest.mark.parametrize('distance', ["euclidean", "haversine"])
+@pytest.mark.parametrize('n_neighbors', [4, 25])
+@pytest.mark.parametrize('nrows', [unit_param(10000), stress_param(70000)])
+def test_nearest_neighbors_rbc(distance, n_neighbors, nrows):
+    X, y = make_blobs(n_samples=nrows,
+                      centers=25,
+                      shuffle=True,
+                      n_features=2,
+                      cluster_std=3.0,
+                      random_state=42)
+
+    knn_cu = cuKNN(metric=distance, algorithm="rbc")
+    knn_cu.fit(X)
+
+    query_rows = int(nrows/2)
+
+    rbc_d, rbc_i = knn_cu.kneighbors(X[:query_rows, :],
+                                     n_neighbors=n_neighbors)
+
+    if distance == 'euclidean':
+        # Need to use unexpanded euclidean distance
+        pw_dists = cuPW(X, metric="l2")
+        brute_i = cp.argsort(pw_dists, axis=1)[:query_rows, :n_neighbors]
+        brute_d = cp.sort(pw_dists, axis=1)[:query_rows, :n_neighbors]
+    else:
+        knn_cu_brute = cuKNN(metric=distance, algorithm="brute")
+        knn_cu_brute.fit(X)
+
+        brute_d, brute_i = knn_cu_brute.kneighbors(
+            X[:query_rows, :], n_neighbors=n_neighbors)
+
+    rbc_i = cp.sort(rbc_i, axis=1)
+    brute_i = cp.sort(brute_i, axis=1)
+
+    # TODO: These are failing with 1 or 2 mismatched elements
+    # for very small values of k:
+    # https://github.com/rapidsai/cuml/issues/4262
+    assert len(brute_d[brute_d != rbc_d]) <= 1
+    assert len(brute_i[brute_i != rbc_i]) <= 1
+
+
 @pytest.mark.parametrize("metric", valid_metrics_sparse())
 @pytest.mark.parametrize(
     'nrows,ncols,density,n_neighbors,batch_size_index,batch_size_query',
@@ -559,7 +605,9 @@ def test_nearest_neighbors_sparse(metric,
 
     skD, skI = sknn.kneighbors(b.get())
 
-    cp.testing.assert_allclose(cuD, skD, atol=1e-3, rtol=1e-3)
+    # For some reason, this will occasionally fail w/ a single
+    # mismatched element in CI. Allowing the single mismatch for now.
+    cp.testing.assert_allclose(cuD, skD, atol=1e-5, rtol=1e-5)
 
     # Jaccard & Chebyshev have a high potential for mismatched indices
     # due to duplicate distances. We can ignore the indices in this case.
diff --git a/python/cuml/test/test_random_forest.py b/python/cuml/test/test_random_forest.py
index 38fb02216a..a794162bfc 100644
--- a/python/cuml/test/test_random_forest.py
+++ b/python/cuml/test/test_random_forest.py
@@ -14,13 +14,12 @@
 #
 import pytest
 
+import warnings
 import cudf
 import numpy as np
 import random
 import json
-import io
 import os
-from contextlib import redirect_stdout
 
 from numba import cuda
 
@@ -33,7 +32,8 @@
 
 from sklearn.ensemble import RandomForestClassifier as skrfc
 from sklearn.ensemble import RandomForestRegressor as skrfr
-from sklearn.metrics import accuracy_score, mean_squared_error
+from sklearn.metrics import accuracy_score, mean_squared_error, \
+    mean_tweedie_deviance
 from sklearn.datasets import fetch_california_housing, \
     make_classification, make_regression, load_iris, load_breast_cancer, \
     load_boston
@@ -192,6 +192,63 @@ def special_reg(request):
     return X, y
 
 
+@pytest.mark.parametrize("max_depth", [2, 4])
+@pytest.mark.parametrize("split_criterion",
+                         ["poisson", "gamma", "inverse_gaussian"])
+def test_tweedie_convergence(max_depth, split_criterion):
+    np.random.seed(33)
+    bootstrap = None
+    max_features = 1.0
+    n_estimators = 1
+    min_impurity_decrease = 1e-5
+    n_datapoints = 1000
+    tweedie = {
+        "poisson":
+            {"power": 1,
+             "gen": np.random.poisson, "args": [0.01]},
+        "gamma":
+            {"power": 2,
+             "gen": np.random.gamma, "args": [2.0]},
+        "inverse_gaussian":
+            {"power": 3,
+             "gen": np.random.wald, "args": [0.1, 2.0]}
+    }
+    # generating random dataset with tweedie distribution
+    X = np.random.random((n_datapoints, 4)).astype(np.float32)
+    y = tweedie[split_criterion]["gen"](*tweedie[split_criterion]["args"],
+                                        size=n_datapoints).astype(np.float32)
+
+    tweedie_preds = curfr(
+        split_criterion=split_criterion,
+        max_depth=max_depth,
+        n_estimators=n_estimators,
+        bootstrap=bootstrap,
+        max_features=max_features,
+        min_impurity_decrease=min_impurity_decrease).fit(X, y).predict(X)
+    mse_preds = curfr(
+        split_criterion=2,
+        max_depth=max_depth,
+        n_estimators=n_estimators,
+        bootstrap=bootstrap,
+        max_features=max_features,
+        min_impurity_decrease=min_impurity_decrease).fit(X, y).predict(X)
+    # y should not be non-positive for mean_poisson_deviance
+    mask = mse_preds > 0
+    mse_tweedie_deviance = mean_tweedie_deviance(y[mask],
+                                                 mse_preds[mask],
+                                                 power=tweedie
+                                                 [split_criterion]["power"])
+    tweedie_tweedie_deviance = mean_tweedie_deviance(y[mask],
+                                                     tweedie_preds[mask],
+                                                     power=tweedie
+                                                     [split_criterion]["power"]
+                                                     )
+
+    # model trained on tweedie data with
+    # tweedie criterion must perform better on tweedie loss
+    assert mse_tweedie_deviance >= tweedie_tweedie_deviance
+
+
 @pytest.mark.parametrize(
     "max_samples", [unit_param(1.0), quality_param(0.90), stress_param(0.95)]
 )
@@ -224,9 +281,7 @@ def test_rf_classification(small_clf, datatype, max_samples, max_features):
         max_leaves=-1,
         max_depth=16,
     )
-    f = io.StringIO()
-    with redirect_stdout(f):
-        cuml_model.fit(X_train, y_train)
+    cuml_model.fit(X_train, y_train)
 
     fil_preds = cuml_model.predict(
         X_test, predict_model="GPU", threshold=0.5, algo="auto"
@@ -406,11 +461,22 @@ def test_rf_classification_float64(small_clf, datatype, convert_dtype):
 
         fil_acc = accuracy_score(y_test, fil_preds)
         assert fil_acc >= (cu_acc - 0.07)  # to be changed to 0.02. see issue #3910: https://github.com/rapidsai/cuml/issues/3910 # noqa
-    else:
-        with pytest.raises(TypeError):
+    # if GPU predict cannot be used, display warning and use CPU predict
+    elif datatype[1] == np.float64:
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
             fil_preds = cuml_model.predict(
-                X_test, predict_model="GPU", convert_dtype=convert_dtype
+                X_test, predict_model="GPU",
+                convert_dtype=convert_dtype
             )
+            assert("GPU based predict only accepts "
+                   "np.float32 data. The model was "
+                   "trained on np.float64 data hence "
+                   "cannot use GPU-based prediction! "
+                   "\nDefaulting to CPU-based Prediction. "
+                   "\nTo predict on float-64 data, set "
+                   "parameter predict_model = 'CPU'"
+                   in str(w[-1].message))
 
 
 @pytest.mark.parametrize(
@@ -454,10 +520,21 @@ def test_rf_regression_float64(large_reg, datatype):
         assert fil_r2 >= (cu_r2 - 0.02)
 
     #  because datatype[0] != np.float32 or datatype[0] != datatype[1]
-    with pytest.raises(TypeError):
-        fil_preds = cuml_model.predict(
-            X_test, predict_model="GPU", convert_dtype=False
-        )
+    # display warning when GPU-predict cannot be used and revert to CPU-predict
+    elif datatype[1] == np.float64:
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            fil_preds = cuml_model.predict(
+                X_test, predict_model="GPU"
+                )
+            assert("GPU based predict only accepts "
+                   "np.float32 data. The model was "
+                   "trained on np.float64 data hence "
+                   "cannot use GPU-based prediction! "
+                   "\nDefaulting to CPU-based Prediction. "
+                   "\nTo predict on float-64 data, set "
+                   "parameter predict_model = 'CPU'"
+                   in str(w[-1].message))
 
 
 def check_predict_proba(test_proba, baseline_proba, y_test, rel_err):
@@ -998,25 +1075,23 @@ def predict_with_json_tree(tree, x):
     def predict_with_json_rf_classifier(rf, x):
         # Returns the class with the highest vote. If there is a tie, return
         # the list of all classes with the highest vote.
-        vote = []
+        predictions = []
         for tree in rf:
-            vote.append(predict_with_json_tree(tree, x))
-        vote = np.bincount(vote)
-        max_vote = np.max(vote)
-        majority_vote = np.nonzero(np.equal(vote, max_vote))[0]
-        return majority_vote
+            predictions.append(np.array(predict_with_json_tree(tree, x)))
+        predictions = np.sum(predictions, axis=0)
+        return np.argmax(predictions)
 
     def predict_with_json_rf_regressor(rf, x):
         pred = 0.0
         for tree in rf:
-            pred += predict_with_json_tree(tree, x)
+            pred += predict_with_json_tree(tree, x)[0]
         return pred / len(rf)
 
     if estimator_type == "classification":
         expected_pred = cuml_model.predict(X).astype(np.int32)
         for idx, row in enumerate(X):
             majority_vote = predict_with_json_rf_classifier(json_obj, row)
-            assert expected_pred[idx] in majority_vote
+            assert expected_pred[idx] == majority_vote
     elif estimator_type == "regression":
         expected_pred = cuml_model.predict(X).astype(np.float32)
         pred = []
@@ -1134,9 +1209,7 @@ def test_concat_memory_leak(large_clf, estimator_type):
     assert (used_mem - initial_baseline_mem) < 1e6
 
 
-@pytest.mark.xfail(strict=True, raises=ValueError)
 def test_rf_nbins_small(small_clf):
-
     X, y = small_clf
     X = X.astype(np.float32)
     y = y.astype(np.int32)
@@ -1146,7 +1219,15 @@ def test_rf_nbins_small(small_clf):
     # Initialize, fit and predict using cuML's
     # random forest classification model
     cuml_model = curfc()
-    cuml_model.fit(X_train[0:3, :], y_train[0:3])
+
+    # display warning when nbins less than samples
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")
+        cuml_model.fit(X_train[0:3, :], y_train[0:3])
+        assert("The number of bins, `n_bins` is greater than "
+               "the number of samples used for training. "
+               "Changing `n_bins` to number of training samples."
+               in str(w[-1].message))
 
 
 @pytest.mark.parametrize("split_criterion", [2], ids=["mse"])
@@ -1172,7 +1253,7 @@ def test_rf_regression_with_identical_labels(split_criterion):
     clf.fit(X, y)
     model_dump = json.loads(clf.get_json())
     assert len(model_dump) == 1
-    expected_dump = {"nodeid": 0, "leaf_value": 1.0, "instance_count": 5}
+    expected_dump = {"nodeid": 0, "leaf_value": [1.0], "instance_count": 5}
     assert model_dump[0] == expected_dump
 
 
@@ -1196,14 +1277,14 @@ def test_rf_binary_classifier_gtil_integration(tmpdir):
     X, y = X.astype(np.float32), y.astype(np.int32)
     clf = curfc(max_depth=3, random_state=0, n_estimators=10)
     clf.fit(X, y)
-    expected_prob = clf.predict_proba(X)[:, 1]
+    expected_pred = clf.predict(X)
 
     checkpoint_path = os.path.join(tmpdir, 'checkpoint.tl')
     clf.convert_to_treelite_model().to_treelite_checkpoint(checkpoint_path)
 
     tl_model = treelite.Model.deserialize(checkpoint_path)
-    out_prob = treelite.gtil.predict(tl_model, X)
-    np.testing.assert_almost_equal(out_prob, expected_prob, decimal=5)
+    out_pred = treelite.gtil.predict(tl_model, X)
+    np.testing.assert_almost_equal(out_pred, expected_pred, decimal=5)
 
 
 def test_rf_multiclass_classifier_gtil_integration(tmpdir):
diff --git a/python/cuml/test/test_text_feature_extraction.py b/python/cuml/test/test_text_feature_extraction.py
index 1ff751d316..bf09e320be 100644
--- a/python/cuml/test/test_text_feature_extraction.py
+++ b/python/cuml/test/test_text_feature_extraction.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -374,6 +374,20 @@ def test_tfidf_vectorizer(norm, use_idf, smooth_idf, sublinear_tf):
     cp.testing.assert_array_almost_equal(tfidf_mat.todense(), ref.toarray())
 
 
+def test_tfidf_vectorizer_get_feature_names():
+    corpus = [
+        'This is the first document.',
+        'This document is the second document.',
+        'And this is the third one.',
+        'Is this the first document?',
+    ]
+    vectorizer = TfidfVectorizer()
+    vectorizer.fit_transform(Series(corpus))
+    output = ['and', 'document', 'first', 'is',
+              'one', 'second', 'the', 'third', 'this']
+    assert vectorizer.get_feature_names().to_arrow().to_pylist() == output
+
+
 # ----------------------------------------------------------------
 # HashingVectorizer tests
 # ----------------------------------------------------------------
diff --git a/python/cuml/test/test_trustworthiness.py b/python/cuml/test/test_trustworthiness.py
index 7a2a117067..17a623bd3f 100644
--- a/python/cuml/test/test_trustworthiness.py
+++ b/python/cuml/test/test_trustworthiness.py
@@ -48,3 +48,11 @@ def test_trustworthiness(input_type, n_samples, n_features, n_components,
     cu_score = cuml_trustworthiness(X, X_embedded, batch_size=batch_size)
 
     assert abs(cu_score - sk_score) <= 1e-3
+
+
+def test_trustworthiness_invalid_input():
+    X, y = make_blobs(n_samples=10, centers=1,
+                      n_features=2, random_state=32)
+
+    with pytest.raises(ValueError):
+        cuml_trustworthiness(X, X, n_neighbors=50)
diff --git a/python/cuml/test/ts_datasets/README.md b/python/cuml/test/ts_datasets/README.md
index 77494769f7..6c1bbe3dba 100644
--- a/python/cuml/test/ts_datasets/README.md
+++ b/python/cuml/test/ts_datasets/README.md
@@ -18,3 +18,8 @@ This folder contains various datasets to test our time series analysis. Using da
 - `passenger_movements.csv`: Passenger movements (thousands), quarterly 1975-2019.
 - `police_recorded_crime.csv`: Recorded crimes (units) per year, 1878-2014.
 - `population_estimate.csv`: Population estimates (thousands) per year, 1875-2011.
+
+The following files are derived from the Stats NZ dataset by removing observations (to test support for missing observations):
+- `guest_nights_by_region_missing.csv`
+- `hourly_earnings_by_industry_missing.csv`
+- `population_estimate_missing.csv`
\ No newline at end of file
diff --git a/python/cuml/test/ts_datasets/guest_nights_by_region_missing.csv b/python/cuml/test/ts_datasets/guest_nights_by_region_missing.csv
new file mode 100644
index 0000000000..cb56443a2c
--- /dev/null
+++ b/python/cuml/test/ts_datasets/guest_nights_by_region_missing.csv
@@ -0,0 +1,280 @@
+,Northland,Auckland,Waikato,Bay of Plenty,"Hawke's Bay, Gisborne","Taranaki, Manawatu, Wanganui",Wellington,"Nelson, Marlborough, Tasman",Canterbury,West Coast,Otago,Southland
+0,66.0,257.0,124.0,159.0,49.0,93.0,111.0,52.0,209.0,27.0,175.0,21.0
+1,58.0,266.0,111.0,,45.0,89.0,105.0,54.0,210.0,28.0,211.0,23.0
+2,79.0,264.0,140.0,174.0,62.0,115.0,113.0,79.0,256.0,45.0,214.0,31.0
+3,,280.0,147.0,177.0,61.0,111.0,,79.0,256.0,57.0,194.0,41.0
+4,104.0,334.0,,199.0,66.0,112.0,,100.0,324.0,64.0,212.0,54.0
+5,185.0,337.0,219.0,257.0,87.0,111.0,,171.0,364.0,,262.0,66.0
+6,279.0,386.0,343.0,419.0,145.0,151.0,142.0,,440.0,96.0,360.0,76.0
+7,158.0,362.0,205.0,238.0,91.0,122.0,144.0,165.0,,,286.0,78.0
+8,,377.0,225.0,251.0,90.0,129.0,145.0,150.0,348.0,81.0,,73.0
+9,108.0,302.0,151.0,198.0,73.0,104.0,119.0,105.0,278.0,57.0,213.0,45.0
+10,73.0,,,,50.0,87.0,104.0,68.0,192.0,32.0,138.0,29.0
+11,55.0,224.0,102.0,119.0,45.0,,92.0,56.0,159.0,23.0,,19.0
+12,70.0,259.0,137.0,164.0,50.0,101.0,102.0,63.0,213.0,27.0,177.0,24.0
+13,62.0,266.0,113.0,134.0,48.0,100.0,95.0,63.0,202.0,26.0,215.0,22.0
+14,70.0,239.0,,149.0,52.0,106.0,,67.0,211.0,,182.0,27.0
+15,102.0,292.0,172.0,210.0,,132.0,127.0,,288.0,55.0,199.0,46.0
+16,105.0,303.0,152.0,188.0,64.0,,125.0,101.0,300.0,,208.0,54.0
+17,218.0,301.0,249.0,258.0,,104.0,,179.0,358.0,71.0,244.0,
+18,380.0,340.0,381.0,,155.0,147.0,137.0,303.0,447.0,92.0,360.0,85.0
+19,158.0,349.0,,252.0,92.0,115.0,142.0,178.0,367.0,83.0,259.0,77.0
+20,,339.0,193.0,205.0,82.0,,147.0,147.0,327.0,73.0,246.0,62.0
+21,130.0,291.0,193.0,216.0,83.0,113.0,135.0,123.0,301.0,64.0,246.0,54.0
+22,75.0,255.0,129.0,148.0,54.0,85.0,119.0,74.0,189.0,36.0,136.0,34.0
+23,,224.0,108.0,118.0,39.0,69.0,,53.0,145.0,23.0,104.0,19.0
+24,60.0,267.0,133.0,170.0,51.0,83.0,137.0,66.0,206.0,30.0,192.0,22.0
+25,58.0,251.0,115.0,140.0,,81.0,,57.0,,,223.0,
+26,66.0,253.0,139.0,160.0,56.0,,123.0,72.0,211.0,42.0,208.0,31.0
+27,99.0,,170.0,218.0,77.0,113.0,146.0,95.0,292.0,62.0,206.0,44.0
+28,107.0,330.0,154.0,191.0,65.0,97.0,136.0,109.0,303.0,70.0,227.0,57.0
+29,187.0,312.0,250.0,270.0,90.0,99.0,134.0,189.0,344.0,86.0,271.0,60.0
+30,280.0,367.0,381.0,452.0,149.0,138.0,163.0,328.0,454.0,116.0,432.0,87.0
+31,145.0,360.0,,234.0,88.0,111.0,146.0,182.0,,97.0,294.0,82.0
+32,129.0,379.0,211.0,208.0,84.0,115.0,157.0,,333.0,85.0,273.0,79.0
+33,,309.0,187.0,255.0,81.0,115.0,153.0,129.0,317.0,71.0,244.0,58.0
+34,66.0,266.0,117.0,150.0,46.0,76.0,120.0,69.0,195.0,35.0,141.0,32.0
+35,,253.0,136.0,156.0,51.0,84.0,127.0,67.0,190.0,34.0,126.0,25.0
+36,61.0,302.0,140.0,176.0,54.0,88.0,134.0,62.0,216.0,33.0,225.0,25.0
+37,59.0,281.0,,146.0,,98.0,114.0,62.0,219.0,29.0,249.0,
+38,81.0,,158.0,195.0,62.0,130.0,,85.0,279.0,50.0,256.0,
+39,99.0,335.0,178.0,205.0,70.0,103.0,138.0,94.0,302.0,58.0,216.0,44.0
+40,110.0,396.0,170.0,209.0,72.0,96.0,144.0,120.0,338.0,76.0,254.0,60.0
+41,176.0,368.0,251.0,282.0,94.0,102.0,144.0,181.0,340.0,84.0,282.0,
+42,,459.0,405.0,469.0,157.0,,,316.0,430.0,118.0,,82.0
+43,149.0,436.0,241.0,251.0,95.0,114.0,159.0,192.0,383.0,109.0,,86.0
+44,138.0,,220.0,230.0,89.0,116.0,180.0,166.0,362.0,98.0,292.0,74.0
+45,136.0,355.0,223.0,246.0,92.0,112.0,162.0,145.0,357.0,89.0,302.0,65.0
+46,67.0,291.0,124.0,146.0,48.0,81.0,111.0,,216.0,39.0,163.0,35.0
+47,56.0,281.0,134.0,145.0,49.0,80.0,110.0,67.0,,,141.0,26.0
+48,60.0,317.0,157.0,183.0,58.0,106.0,131.0,77.0,248.0,43.0,257.0,28.0
+49,58.0,288.0,,156.0,50.0,108.0,138.0,72.0,227.0,38.0,,28.0
+50,74.0,284.0,167.0,174.0,63.0,119.0,137.0,86.0,254.0,49.0,261.0,37.0
+51,102.0,348.0,192.0,196.0,77.0,111.0,146.0,,314.0,67.0,240.0,47.0
+52,119.0,,188.0,201.0,75.0,106.0,170.0,124.0,361.0,89.0,283.0,
+53,190.0,401.0,,296.0,107.0,113.0,163.0,204.0,408.0,101.0,348.0,74.0
+54,289.0,483.0,,499.0,170.0,,185.0,342.0,501.0,138.0,,93.0
+55,152.0,409.0,235.0,259.0,108.0,118.0,177.0,210.0,412.0,117.0,360.0,89.0
+56,144.0,433.0,240.0,259.0,104.0,116.0,187.0,180.0,413.0,115.0,338.0,87.0
+57,124.0,354.0,203.0,249.0,95.0,114.0,,143.0,358.0,87.0,296.0,63.0
+58,,317.0,126.0,153.0,53.0,84.0,127.0,,231.0,51.0,188.0,42.0
+59,59.0,298.0,140.0,162.0,54.0,86.0,130.0,71.0,,40.0,155.0,27.0
+60,65.0,340.0,160.0,197.0,65.0,112.0,146.0,82.0,266.0,46.0,291.0,32.0
+61,63.0,339.0,142.0,159.0,53.0,116.0,132.0,80.0,278.0,43.0,322.0,
+62,76.0,335.0,163.0,184.0,70.0,126.0,152.0,,290.0,57.0,276.0,39.0
+63,101.0,380.0,196.0,214.0,80.0,118.0,153.0,113.0,332.0,,257.0,54.0
+64,121.0,,195.0,203.0,81.0,109.0,174.0,135.0,367.0,91.0,272.0,69.0
+65,210.0,437.0,280.0,306.0,118.0,120.0,174.0,216.0,427.0,,361.0,78.0
+66,314.0,521.0,400.0,490.0,,159.0,192.0,329.0,516.0,133.0,501.0,98.0
+67,167.0,,251.0,270.0,115.0,,,,,115.0,370.0,95.0
+68,179.0,507.0,274.0,301.0,125.0,153.0,206.0,208.0,511.0,121.0,385.0,98.0
+69,124.0,402.0,209.0,227.0,87.0,104.0,161.0,,350.0,87.0,302.0,75.0
+70,81.0,364.0,149.0,174.0,,91.0,138.0,83.0,,51.0,194.0,45.0
+71,,333.0,150.0,,59.0,91.0,134.0,74.0,222.0,41.0,172.0,32.0
+72,,370.0,175.0,201.0,68.0,119.0,158.0,77.0,297.0,51.0,,36.0
+73,68.0,369.0,154.0,168.0,,124.0,,77.0,277.0,43.0,317.0,34.0
+74,80.0,367.0,168.0,187.0,75.0,133.0,147.0,99.0,310.0,58.0,264.0,42.0
+75,104.0,437.0,199.0,213.0,82.0,135.0,170.0,112.0,367.0,79.0,251.0,57.0
+76,120.0,,210.0,222.0,81.0,115.0,181.0,140.0,408.0,101.0,299.0,80.0
+77,204.0,479.0,287.0,,120.0,120.0,168.0,230.0,447.0,113.0,363.0,85.0
+78,318.0,,393.0,,,165.0,191.0,340.0,547.0,147.0,486.0,104.0
+79,,503.0,279.0,269.0,,136.0,204.0,227.0,482.0,131.0,,
+80,,510.0,267.0,245.0,111.0,,200.0,201.0,462.0,131.0,369.0,99.0
+81,148.0,411.0,252.0,,,136.0,176.0,169.0,379.0,109.0,322.0,
+82,85.0,356.0,158.0,167.0,67.0,101.0,156.0,93.0,248.0,57.0,190.0,
+83,64.0,307.0,142.0,,59.0,83.0,142.0,,214.0,42.0,159.0,31.0
+84,75.0,370.0,181.0,,75.0,131.0,154.0,81.0,286.0,53.0,320.0,39.0
+85,70.0,366.0,157.0,169.0,65.0,126.0,142.0,82.0,269.0,48.0,301.0,36.0
+86,90.0,367.0,177.0,205.0,76.0,140.0,165.0,93.0,315.0,63.0,287.0,47.0
+87,110.0,406.0,195.0,227.0,89.0,133.0,172.0,114.0,375.0,82.0,260.0,60.0
+88,123.0,472.0,214.0,227.0,89.0,113.0,189.0,139.0,413.0,,309.0,83.0
+89,214.0,486.0,,309.0,142.0,131.0,182.0,,482.0,130.0,397.0,98.0
+90,322.0,556.0,406.0,502.0,223.0,,208.0,331.0,,165.0,514.0,111.0
+91,177.0,492.0,273.0,281.0,135.0,144.0,214.0,219.0,513.0,149.0,399.0,112.0
+92,159.0,515.0,266.0,,126.0,146.0,212.0,197.0,504.0,,368.0,103.0
+93,142.0,,242.0,268.0,110.0,132.0,182.0,156.0,433.0,117.0,341.0,84.0
+94,84.0,382.0,148.0,176.0,69.0,98.0,150.0,88.0,290.0,60.0,204.0,43.0
+95,77.0,350.0,165.0,,65.0,102.0,145.0,77.0,256.0,46.0,203.0,32.0
+96,85.0,387.0,187.0,,73.0,158.0,171.0,84.0,319.0,53.0,321.0,37.0
+97,71.0,371.0,,185.0,65.0,136.0,145.0,78.0,293.0,,324.0,37.0
+98,90.0,380.0,186.0,216.0,78.0,154.0,167.0,98.0,355.0,77.0,317.0,47.0
+99,109.0,434.0,,230.0,86.0,141.0,169.0,107.0,383.0,89.0,,59.0
+100,130.0,467.0,219.0,240.0,92.0,133.0,193.0,150.0,459.0,118.0,326.0,87.0
+101,218.0,479.0,297.0,319.0,,138.0,194.0,228.0,498.0,130.0,386.0,99.0
+102,329.0,569.0,415.0,535.0,228.0,186.0,218.0,358.0,635.0,167.0,518.0,121.0
+103,172.0,,269.0,288.0,128.0,,220.0,223.0,522.0,152.0,414.0,117.0
+104,187.0,531.0,301.0,312.0,134.0,166.0,236.0,217.0,557.0,156.0,421.0,117.0
+105,137.0,449.0,248.0,256.0,103.0,134.0,203.0,145.0,425.0,108.0,,80.0
+106,83.0,369.0,,181.0,67.0,,158.0,,274.0,63.0,218.0,42.0
+107,73.0,351.0,163.0,183.0,,115.0,166.0,80.0,302.0,55.0,221.0,35.0
+108,84.0,415.0,193.0,222.0,73.0,147.0,191.0,82.0,310.0,57.0,337.0,36.0
+109,73.0,361.0,157.0,173.0,66.0,147.0,158.0,,282.0,50.0,336.0,38.0
+110,87.0,383.0,185.0,197.0,79.0,151.0,,91.0,325.0,66.0,327.0,44.0
+111,111.0,412.0,209.0,230.0,102.0,145.0,187.0,116.0,387.0,94.0,283.0,58.0
+112,127.0,463.0,212.0,237.0,,129.0,205.0,148.0,,117.0,338.0,
+113,195.0,457.0,,,141.0,142.0,195.0,217.0,469.0,126.0,400.0,88.0
+114,312.0,526.0,395.0,488.0,225.0,194.0,228.0,,,169.0,550.0,119.0
+115,184.0,483.0,274.0,301.0,,150.0,234.0,241.0,524.0,152.0,434.0,
+116,170.0,,,274.0,133.0,,242.0,203.0,508.0,144.0,404.0,
+117,149.0,430.0,244.0,265.0,113.0,127.0,209.0,157.0,,118.0,372.0,83.0
+118,82.0,365.0,148.0,177.0,65.0,,170.0,86.0,289.0,63.0,229.0,44.0
+119,72.0,346.0,162.0,,67.0,103.0,158.0,75.0,249.0,51.0,,31.0
+120,84.0,356.0,189.0,204.0,,154.0,188.0,81.0,306.0,58.0,332.0,35.0
+121,73.0,,158.0,,74.0,150.0,168.0,78.0,281.0,54.0,,
+122,,393.0,188.0,208.0,,169.0,190.0,92.0,341.0,69.0,312.0,43.0
+123,126.0,,203.0,,102.0,,206.0,121.0,395.0,96.0,294.0,62.0
+124,135.0,488.0,240.0,242.0,98.0,123.0,,153.0,445.0,118.0,343.0,85.0
+125,,477.0,282.0,340.0,153.0,139.0,208.0,,,132.0,412.0,91.0
+126,357.0,557.0,402.0,506.0,231.0,189.0,,364.0,605.0,173.0,555.0,115.0
+127,199.0,515.0,295.0,313.0,134.0,172.0,260.0,246.0,544.0,169.0,,118.0
+128,179.0,,294.0,304.0,138.0,163.0,264.0,213.0,517.0,147.0,443.0,115.0
+129,141.0,,246.0,281.0,117.0,,215.0,167.0,447.0,115.0,,81.0
+130,89.0,377.0,162.0,193.0,75.0,102.0,180.0,99.0,293.0,68.0,238.0,48.0
+131,,373.0,161.0,178.0,72.0,104.0,171.0,85.0,272.0,55.0,213.0,
+132,78.0,413.0,185.0,210.0,77.0,146.0,184.0,86.0,336.0,64.0,373.0,36.0
+133,75.0,,171.0,174.0,74.0,150.0,177.0,89.0,321.0,,371.0,37.0
+134,92.0,426.0,195.0,,81.0,156.0,202.0,94.0,352.0,74.0,322.0,45.0
+135,114.0,451.0,198.0,239.0,103.0,131.0,204.0,117.0,406.0,91.0,297.0,59.0
+136,126.0,537.0,210.0,252.0,96.0,133.0,228.0,152.0,472.0,116.0,346.0,84.0
+137,210.0,480.0,262.0,349.0,151.0,143.0,219.0,232.0,510.0,132.0,407.0,
+138,,588.0,400.0,529.0,216.0,196.0,253.0,,649.0,173.0,,112.0
+139,,557.0,281.0,336.0,145.0,163.0,277.0,265.0,562.0,168.0,466.0,122.0
+140,190.0,564.0,289.0,319.0,152.0,177.0,295.0,244.0,593.0,163.0,471.0,116.0
+141,132.0,472.0,221.0,256.0,104.0,130.0,233.0,157.0,,114.0,352.0,80.0
+142,89.0,416.0,165.0,193.0,75.0,107.0,202.0,100.0,315.0,68.0,236.0,50.0
+143,,372.0,140.0,164.0,58.0,,174.0,84.0,270.0,48.0,194.0,31.0
+144,69.0,396.0,165.0,203.0,71.0,149.0,194.0,83.0,347.0,59.0,366.0,37.0
+145,58.0,415.0,161.0,,61.0,149.0,172.0,80.0,316.0,,353.0,35.0
+146,83.0,413.0,176.0,189.0,78.0,151.0,204.0,93.0,349.0,,298.0,45.0
+147,112.0,477.0,215.0,238.0,105.0,152.0,237.0,117.0,416.0,97.0,288.0,59.0
+148,122.0,506.0,210.0,220.0,100.0,,235.0,,458.0,114.0,319.0,83.0
+149,192.0,486.0,278.0,325.0,148.0,137.0,212.0,226.0,491.0,126.0,391.0,92.0
+150,313.0,555.0,394.0,510.0,212.0,204.0,240.0,362.0,616.0,170.0,561.0,115.0
+151,182.0,515.0,270.0,300.0,141.0,155.0,250.0,250.0,,150.0,420.0,
+152,165.0,534.0,269.0,270.0,141.0,155.0,270.0,,531.0,138.0,414.0,
+153,151.0,443.0,,277.0,124.0,143.0,238.0,165.0,453.0,113.0,357.0,80.0
+154,91.0,382.0,161.0,191.0,,,206.0,103.0,318.0,75.0,239.0,49.0
+155,,333.0,141.0,145.0,56.0,98.0,169.0,73.0,252.0,50.0,211.0,29.0
+156,,421.0,175.0,195.0,72.0,159.0,193.0,82.0,,61.0,396.0,36.0
+157,65.0,383.0,156.0,162.0,65.0,146.0,174.0,81.0,327.0,56.0,374.0,
+158,85.0,413.0,181.0,199.0,79.0,156.0,211.0,91.0,360.0,73.0,317.0,
+159,116.0,482.0,211.0,233.0,99.0,139.0,236.0,121.0,426.0,97.0,300.0,65.0
+160,126.0,488.0,211.0,223.0,98.0,120.0,232.0,146.0,460.0,116.0,342.0,89.0
+161,,491.0,,348.0,136.0,148.0,,235.0,516.0,,430.0,96.0
+162,332.0,568.0,435.0,522.0,219.0,193.0,263.0,393.0,647.0,170.0,579.0,118.0
+163,183.0,524.0,273.0,,135.0,146.0,267.0,244.0,536.0,153.0,441.0,
+164,165.0,539.0,270.0,290.0,135.0,159.0,267.0,,542.0,,441.0,110.0
+165,146.0,468.0,237.0,276.0,,139.0,222.0,163.0,,112.0,386.0,83.0
+166,87.0,,148.0,183.0,66.0,,184.0,86.0,291.0,62.0,227.0,44.0
+167,66.0,356.0,147.0,180.0,61.0,101.0,170.0,70.0,265.0,47.0,234.0,33.0
+168,72.0,,172.0,218.0,70.0,144.0,190.0,77.0,350.0,59.0,408.0,39.0
+169,64.0,427.0,143.0,172.0,62.0,,,78.0,324.0,50.0,,33.0
+170,78.0,438.0,165.0,206.0,73.0,137.0,205.0,90.0,361.0,64.0,324.0,42.0
+171,120.0,491.0,215.0,238.0,96.0,138.0,,114.0,408.0,88.0,292.0,56.0
+172,118.0,545.0,219.0,242.0,95.0,130.0,224.0,145.0,465.0,109.0,330.0,80.0
+173,202.0,515.0,277.0,,132.0,137.0,226.0,228.0,498.0,120.0,420.0,89.0
+174,327.0,611.0,418.0,485.0,204.0,187.0,249.0,381.0,641.0,163.0,558.0,
+175,172.0,,273.0,,,152.0,265.0,238.0,461.0,152.0,465.0,109.0
+176,160.0,582.0,269.0,280.0,123.0,165.0,279.0,204.0,377.0,,,
+177,150.0,508.0,253.0,272.0,104.0,141.0,232.0,161.0,337.0,,,74.0
+178,,459.0,146.0,188.0,67.0,98.0,197.0,93.0,244.0,56.0,,44.0
+179,67.0,401.0,154.0,181.0,62.0,,187.0,80.0,219.0,49.0,210.0,32.0
+180,82.0,485.0,181.0,231.0,63.0,138.0,220.0,90.0,280.0,54.0,399.0,36.0
+181,70.0,492.0,163.0,214.0,,134.0,217.0,93.0,269.0,,393.0,33.0
+182,85.0,,183.0,207.0,86.0,134.0,221.0,91.0,265.0,59.0,323.0,48.0
+183,108.0,557.0,221.0,230.0,99.0,142.0,238.0,128.0,311.0,78.0,,48.0
+184,123.0,536.0,227.0,246.0,98.0,127.0,243.0,149.0,,,358.0,77.0
+185,196.0,578.0,290.0,321.0,133.0,143.0,220.0,220.0,427.0,123.0,,87.0
+186,321.0,612.0,,437.0,186.0,197.0,,354.0,519.0,161.0,608.0,109.0
+187,180.0,582.0,,280.0,136.0,166.0,271.0,,402.0,140.0,436.0,97.0
+188,142.0,620.0,274.0,261.0,123.0,175.0,266.0,195.0,392.0,123.0,414.0,86.0
+189,126.0,537.0,240.0,263.0,90.0,137.0,209.0,151.0,343.0,103.0,,69.0
+190,80.0,458.0,,172.0,66.0,,188.0,92.0,242.0,57.0,237.0,
+191,67.0,453.0,,176.0,61.0,98.0,176.0,80.0,225.0,44.0,230.0,32.0
+192,71.0,448.0,169.0,191.0,60.0,136.0,189.0,84.0,271.0,57.0,398.0,35.0
+193,60.0,,146.0,169.0,54.0,118.0,172.0,76.0,262.0,46.0,378.0,
+194,79.0,470.0,173.0,193.0,68.0,,213.0,91.0,,59.0,311.0,39.0
+195,112.0,574.0,213.0,235.0,97.0,133.0,236.0,123.0,348.0,86.0,317.0,53.0
+196,117.0,595.0,219.0,224.0,98.0,125.0,247.0,138.0,,100.0,330.0,70.0
+197,,,290.0,311.0,133.0,141.0,247.0,221.0,435.0,130.0,492.0,82.0
+198,315.0,636.0,429.0,,180.0,183.0,,347.0,528.0,135.0,577.0,105.0
+199,168.0,604.0,,310.0,128.0,148.0,,234.0,420.0,142.0,462.0,104.0
+200,170.0,655.0,315.0,312.0,138.0,179.0,,,,130.0,476.0,96.0
+201,119.0,546.0,,247.0,95.0,122.0,223.0,142.0,353.0,97.0,388.0,69.0
+202,83.0,497.0,158.0,192.0,70.0,101.0,216.0,103.0,,56.0,,42.0
+203,69.0,442.0,151.0,,58.0,,195.0,82.0,246.0,43.0,262.0,30.0
+204,70.0,489.0,186.0,212.0,68.0,,,85.0,,53.0,427.0,34.0
+205,,490.0,,188.0,57.0,,189.0,,273.0,48.0,420.0,33.0
+206,77.0,522.0,185.0,215.0,72.0,139.0,,92.0,298.0,57.0,323.0,40.0
+207,,571.0,223.0,242.0,96.0,138.0,239.0,116.0,370.0,76.0,330.0,55.0
+208,116.0,616.0,227.0,,96.0,142.0,243.0,151.0,405.0,107.0,366.0,81.0
+209,206.0,601.0,,341.0,133.0,157.0,241.0,224.0,446.0,136.0,491.0,97.0
+210,345.0,,436.0,495.0,185.0,210.0,256.0,364.0,,159.0,603.0,115.0
+211,177.0,652.0,305.0,,126.0,160.0,264.0,259.0,478.0,153.0,,103.0
+212,149.0,639.0,289.0,283.0,125.0,166.0,265.0,215.0,459.0,133.0,476.0,105.0
+213,155.0,566.0,269.0,297.0,109.0,135.0,,167.0,,115.0,,82.0
+214,97.0,538.0,182.0,210.0,71.0,109.0,,107.0,303.0,59.0,279.0,48.0
+215,69.0,473.0,160.0,168.0,57.0,90.0,182.0,78.0,,,,31.0
+216,75.0,505.0,188.0,221.0,69.0,138.0,208.0,88.0,315.0,57.0,469.0,40.0
+217,70.0,525.0,167.0,188.0,60.0,124.0,200.0,77.0,290.0,51.0,420.0,38.0
+218,85.0,557.0,191.0,232.0,75.0,141.0,224.0,91.0,300.0,70.0,339.0,48.0
+219,120.0,595.0,,272.0,,143.0,254.0,121.0,385.0,93.0,369.0,67.0
+220,133.0,641.0,239.0,265.0,102.0,136.0,,157.0,416.0,118.0,420.0,91.0
+221,230.0,,326.0,,137.0,169.0,248.0,233.0,498.0,151.0,538.0,112.0
+222,363.0,689.0,467.0,488.0,185.0,207.0,276.0,384.0,583.0,178.0,628.0,118.0
+223,200.0,660.0,323.0,325.0,132.0,169.0,274.0,271.0,510.0,170.0,554.0,116.0
+224,164.0,676.0,311.0,309.0,141.0,170.0,274.0,221.0,505.0,151.0,512.0,119.0
+225,148.0,599.0,280.0,303.0,,141.0,265.0,163.0,428.0,117.0,467.0,90.0
+226,91.0,536.0,190.0,229.0,79.0,118.0,231.0,,317.0,69.0,321.0,52.0
+227,66.0,470.0,160.0,179.0,64.0,107.0,187.0,80.0,258.0,44.0,286.0,34.0
+228,,518.0,207.0,238.0,70.0,148.0,227.0,,329.0,58.0,460.0,43.0
+229,66.0,537.0,177.0,195.0,60.0,141.0,204.0,91.0,301.0,52.0,434.0,42.0
+230,95.0,562.0,,,72.0,144.0,222.0,99.0,,74.0,378.0,
+231,124.0,588.0,251.0,281.0,98.0,147.0,263.0,130.0,398.0,95.0,391.0,69.0
+232,132.0,633.0,256.0,290.0,97.0,143.0,271.0,,436.0,,466.0,97.0
+233,231.0,650.0,352.0,381.0,145.0,171.0,,252.0,546.0,149.0,,113.0
+234,,715.0,469.0,514.0,200.0,216.0,300.0,390.0,,188.0,710.0,141.0
+235,201.0,679.0,345.0,354.0,147.0,,303.0,,544.0,183.0,595.0,136.0
+236,213.0,,351.0,363.0,150.0,193.0,297.0,,549.0,172.0,606.0,148.0
+237,144.0,634.0,273.0,313.0,127.0,149.0,265.0,173.0,428.0,,503.0,106.0
+238,89.0,560.0,177.0,226.0,83.0,105.0,216.0,112.0,300.0,67.0,337.0,58.0
+239,78.0,509.0,192.0,225.0,74.0,108.0,209.0,90.0,266.0,,319.0,40.0
+240,92.0,561.0,217.0,256.0,86.0,145.0,236.0,107.0,345.0,60.0,485.0,45.0
+241,83.0,567.0,198.0,,73.0,143.0,218.0,101.0,302.0,54.0,443.0,
+242,100.0,575.0,229.0,264.0,86.0,,239.0,113.0,345.0,76.0,402.0,
+243,131.0,641.0,275.0,294.0,108.0,146.0,270.0,138.0,419.0,102.0,409.0,71.0
+244,148.0,642.0,288.0,307.0,,146.0,259.0,178.0,442.0,133.0,,110.0
+245,241.0,662.0,383.0,410.0,161.0,,275.0,265.0,509.0,164.0,603.0,122.0
+246,364.0,695.0,497.0,541.0,,222.0,297.0,410.0,598.0,,715.0,146.0
+247,222.0,663.0,357.0,360.0,,190.0,286.0,279.0,525.0,173.0,592.0,
+248,188.0,694.0,341.0,356.0,154.0,191.0,304.0,,499.0,167.0,583.0,138.0
+249,168.0,645.0,309.0,346.0,142.0,162.0,265.0,192.0,,131.0,545.0,114.0
+250,107.0,578.0,,246.0,96.0,120.0,229.0,120.0,326.0,,353.0,
+251,83.0,519.0,202.0,243.0,,122.0,230.0,103.0,289.0,50.0,340.0,44.0
+252,91.0,561.0,215.0,270.0,89.0,153.0,238.0,106.0,354.0,,489.0,46.0
+253,83.0,537.0,187.0,226.0,79.0,138.0,204.0,114.0,316.0,62.0,449.0,45.0
+254,,,220.0,268.0,,160.0,237.0,121.0,361.0,80.0,423.0,57.0
+255,146.0,601.0,283.0,321.0,,168.0,271.0,154.0,,,439.0,85.0
+256,,658.0,300.0,324.0,,158.0,274.0,187.0,475.0,137.0,501.0,117.0
+257,253.0,661.0,396.0,424.0,168.0,188.0,279.0,264.0,572.0,165.0,,138.0
+258,369.0,706.0,501.0,,211.0,223.0,302.0,379.0,661.0,,723.0,155.0
+259,213.0,663.0,356.0,365.0,141.0,181.0,290.0,274.0,590.0,174.0,630.0,153.0
+260,213.0,705.0,371.0,400.0,148.0,205.0,315.0,265.0,596.0,175.0,630.0,154.0
+261,154.0,598.0,287.0,332.0,126.0,159.0,272.0,184.0,507.0,133.0,558.0,121.0
+262,109.0,558.0,198.0,252.0,95.0,127.0,231.0,117.0,345.0,72.0,369.0,67.0
+263,82.0,489.0,194.0,238.0,90.0,123.0,220.0,91.0,301.0,56.0,338.0,47.0
+264,91.0,548.0,223.0,251.0,89.0,169.0,235.0,101.0,375.0,61.0,483.0,52.0
+265,87.0,569.0,203.0,223.0,85.0,149.0,217.0,93.0,345.0,56.0,451.0,47.0
+266,97.0,585.0,228.0,264.0,98.0,160.0,253.0,114.0,371.0,74.0,439.0,64.0
+267,143.0,644.0,287.0,320.0,121.0,177.0,276.0,166.0,467.0,106.0,473.0,87.0
+268,155.0,693.0,292.0,322.0,122.0,170.0,278.0,180.0,508.0,140.0,552.0,123.0
+269,254.0,669.0,403.0,409.0,156.0,194.0,287.0,271.0,600.0,163.0,650.0,144.0
+270,366.0,726.0,530.0,519.0,213.0,226.0,306.0,376.0,669.0,176.0,708.0,155.0
+271,207.0,664.0,352.0,355.0,145.0,191.0,296.0,278.0,592.0,171.0,610.0,153.0
+272,192.0,699.0,367.0,375.0,159.0,202.0,321.0,234.0,570.0,150.0,594.0,145.0
+273,180.0,616.0,319.0,372.0,141.0,175.0,287.0,198.0,525.0,117.0,556.0,119.0
+274,109.0,570.0,206.0,256.0,102.0,125.0,243.0,116.0,345.0,70.0,356.0,66.0
+275,82.0,508.0,200.0,243.0,91.0,122.0,228.0,96.0,301.0,52.0,330.0,46.0
+276,91.0,559.0,222.0,270.0,97.0,163.0,256.0,107.0,354.0,55.0,472.0,53.0
+277,91.0,589.0,204.0,231.0,91.0,151.0,242.0,102.0,348.0,53.0,465.0,51.0
+278,97.0,619.0,233.0,260.0,105.0,172.0,239.0,116.0,369.0,65.0,430.0,64.0
diff --git a/python/cuml/test/ts_datasets/hourly_earnings_by_industry_missing.csv b/python/cuml/test/ts_datasets/hourly_earnings_by_industry_missing.csv
new file mode 100644
index 0000000000..6f2d6b9686
--- /dev/null
+++ b/python/cuml/test/ts_datasets/hourly_earnings_by_industry_missing.csv
@@ -0,0 +1,124 @@
+,Forestry and Mining,Manufacturing,"Electricity, Gas, Water and Waste Services",Construction,Wholesale Trade,Retail Trade,Accommodation and Food Services,"Transport, Postal and Warehousing",Information Media and Telecommunications,Financial and Insurance Services,"Rental, Hiring and Real Estate Services","Professional, Scientific, Technical, Administrative and Support Services",Public Administration and Safety,Health Care and Social Assistance
+0,13.65,12.11,13.65,11.38,13.44,9.5,9.71,12.35,17.14,13.83,12.61,14.79,15.19,13.68
+1,13.77,12.09,,11.54,13.6,9.48,9.74,12.65,17.35,14.31,12.7,14.93,,
+2,13.77,,14.32,11.72,13.77,9.56,9.85,12.84,17.56,14.52,,,15.66,13.68
+3,14.03,,14.44,11.85,,9.64,9.74,,,14.95,13.46,15.81,16.04,13.82
+4,14.14,12.74,14.68,,14.54,10.04,10.15,13.37,18.29,15.25,13.85,,16.17,14.24
+5,,,,12.08,14.7,9.99,,13.82,18.65,15.57,13.55,15.95,16.26,14.35
+6,14.3,12.82,,12.4,14.94,10.2,10.41,14.33,19.06,15.86,13.73,16.17,16.43,14.27
+7,14.67,12.84,15.22,,,10.21,10.38,14.36,19.23,16.09,13.84,16.31,16.85,14.57
+8,14.54,13.21,15.22,12.6,15.2,10.48,,14.33,19.29,16.11,13.88,,16.86,
+9,14.89,13.41,15.39,12.54,15.28,10.35,10.59,14.48,19.55,16.25,13.92,16.41,17.07,14.7
+10,15.34,,15.57,12.7,15.53,10.55,10.66,14.67,,16.48,,16.49,,14.92
+11,15.82,13.34,15.54,12.7,15.59,10.56,10.69,14.82,19.91,16.64,14.09,16.65,,14.94
+12,15.36,13.58,15.63,12.79,15.63,10.59,10.67,14.92,19.88,,14.11,16.69,17.31,
+13,,,15.63,12.71,,10.55,10.64,14.99,20.1,,14.2,,17.46,
+14,,13.39,15.62,12.7,15.83,10.62,10.64,,19.95,,14.24,16.83,17.57,15.15
+15,,13.46,,,16.05,10.51,,14.91,20.01,17.54,14.49,,17.67,15.3
+16,14.91,13.69,15.4,12.66,,10.64,,,19.85,17.58,14.25,16.84,17.47,15.38
+17,14.79,13.71,15.54,12.65,16.12,10.65,10.75,14.87,20.01,17.86,14.48,17.11,17.6,15.27
+18,14.73,13.47,15.49,,16.13,10.66,10.69,14.82,19.83,,14.48,17.12,17.68,15.23
+19,14.94,13.56,,12.72,16.26,10.6,10.73,14.91,19.99,,14.53,,17.59,15.24
+20,15.15,13.72,15.97,12.91,16.14,10.75,10.75,14.96,19.95,18.15,14.63,17.28,17.62,15.53
+21,14.76,13.91,16.15,,16.4,10.79,10.8,15.13,20.26,18.64,14.73,17.41,17.72,15.38
+22,14.79,13.83,,13.06,16.57,10.88,10.88,14.92,20.2,,14.9,17.59,17.94,15.01
+23,15.34,13.8,16.63,13.11,16.35,10.82,,15.01,20.26,18.9,14.9,17.63,17.93,15.76
+24,,14.06,16.62,,,,10.91,15.06,20.23,18.76,14.84,17.52,17.95,15.83
+25,15.86,,,13.35,16.75,10.96,11.0,15.01,,19.65,,17.86,18.07,15.75
+26,,14.18,17.01,13.5,16.74,,11.06,15.13,,19.93,15.15,17.91,,15.95
+27,,14.31,17.09,13.54,16.84,11.11,10.96,,,20.24,15.28,18.07,18.36,16.25
+28,16.18,14.61,,13.76,17.03,11.23,,15.39,20.69,,15.37,18.14,18.4,16.32
+29,16.65,14.71,17.48,13.89,17.23,,11.23,15.32,20.96,20.62,15.78,18.63,,16.37
+30,16.62,14.69,17.72,14.06,17.31,11.44,11.24,15.42,20.97,,15.9,18.78,18.71,16.49
+31,,14.82,,14.13,17.61,,11.55,,21.55,20.98,16.12,19.04,,16.69
+32,16.84,15.1,18.28,14.28,17.88,11.54,11.54,16.05,21.64,21.0,16.18,19.08,19.22,
+33,16.98,15.26,18.57,14.27,17.96,11.64,11.58,,21.9,,16.41,,19.17,17.01
+34,17.34,15.21,19.17,14.44,18.11,11.77,11.66,,21.94,21.88,16.62,19.62,19.11,17.26
+35,,,19.21,14.57,,11.82,11.68,16.23,22.1,21.77,16.59,19.6,19.2,17.3
+36,18.32,15.52,19.2,14.65,18.25,11.96,11.88,16.19,,21.73,16.51,19.46,19.36,17.41
+37,18.26,15.72,19.18,14.9,18.46,12.1,11.8,16.3,22.4,22.04,16.71,19.76,19.6,17.43
+38,18.26,,,15.02,18.6,12.1,11.79,16.43,22.58,22.64,,19.88,19.6,17.47
+39,,15.88,19.89,15.19,,11.97,11.86,16.46,22.68,22.96,17.15,20.25,20.12,17.72
+40,,16.07,19.8,15.11,19.01,12.0,11.76,16.31,22.88,,17.21,20.33,20.39,17.79
+41,18.84,16.16,,15.33,18.89,,11.83,16.19,22.84,23.16,17.5,20.65,20.34,17.55
+42,18.22,16.16,19.69,15.67,,,,16.73,23.1,23.14,,20.92,20.76,17.88
+43,17.55,,21.19,15.75,19.02,12.1,11.66,16.62,23.38,23.08,17.28,20.37,20.93,17.87
+44,17.03,16.56,21.24,15.67,19.25,12.25,11.59,16.51,23.26,,17.31,20.38,20.51,17.98
+45,,,21.72,16.07,19.13,,11.95,16.11,23.21,23.84,17.55,20.66,20.78,17.72
+46,18.5,16.6,21.85,16.24,19.31,12.31,12.02,16.68,23.56,24.16,17.82,20.94,21.29,
+47,17.89,16.78,21.74,16.35,19.06,,12.07,16.8,,,,20.88,21.53,17.94
+48,17.75,17.05,,16.48,19.35,12.54,12.26,16.91,23.91,,17.99,21.19,,18.25
+49,18.19,17.04,,16.26,19.88,12.7,12.27,16.78,23.96,24.85,18.39,21.59,,18.24
+50,,17.23,23.05,16.51,20.1,12.78,,17.06,24.19,24.71,,21.79,22.09,18.72
+51,18.82,17.38,23.28,16.62,20.21,,12.31,16.95,,25.13,18.62,21.91,22.1,18.56
+52,19.49,17.65,23.62,16.42,20.52,12.95,12.35,17.64,24.65,25.33,18.73,22.06,21.95,
+53,19.51,17.82,23.12,16.53,20.07,13.01,12.39,17.16,24.72,25.32,18.46,21.7,,18.78
+54,,,,,20.59,13.31,,,25.57,25.34,18.77,22.13,22.93,19.02
+55,19.66,18.04,23.98,17.26,20.34,13.32,12.44,17.68,25.69,,19.35,22.77,22.73,19.28
+56,20.06,18.02,24.05,16.93,20.49,13.36,12.53,17.26,25.12,26.88,19.28,22.67,22.61,19.38
+57,20.09,18.29,24.14,,20.9,13.53,,17.37,25.44,27.18,19.76,23.2,22.64,19.37
+58,20.19,18.46,24.1,17.12,21.0,13.44,12.98,17.8,25.63,27.42,19.87,23.39,23.22,19.64
+59,20.38,,24.34,17.46,,13.73,13.03,17.85,25.93,27.83,19.8,,23.9,20.28
+60,21.1,18.66,23.59,,21.19,13.77,12.97,17.9,25.85,,,22.98,23.55,20.03
+61,20.83,18.91,24.39,17.94,,,13.08,18.17,26.66,29.22,20.13,,24.2,
+62,21.21,18.76,24.64,17.79,21.63,14.15,13.08,18.63,26.99,28.11,20.23,23.92,24.39,20.67
+63,21.63,18.98,24.69,,21.64,14.12,12.96,,27.1,28.51,,,24.34,20.78
+64,22.03,,25.53,18.13,22.0,14.11,13.23,,27.11,29.18,20.06,,24.83,21.09
+65,22.03,19.34,25.04,18.23,22.3,14.16,13.58,,27.68,29.55,,24.47,24.71,
+66,22.27,19.64,,18.77,22.77,14.69,,,28.05,29.55,21.25,25.16,,21.81
+67,22.86,,26.41,18.91,23.19,14.74,13.36,19.61,28.3,29.48,,25.25,25.61,
+68,23.7,20.08,26.96,18.88,23.14,14.75,13.44,,28.35,30.09,21.36,,25.68,22.57
+69,25.53,20.39,26.46,19.14,,14.91,13.83,19.81,,30.76,21.74,,25.96,22.4
+70,24.61,20.57,26.58,19.66,23.92,15.11,13.92,20.26,29.26,31.46,22.25,26.48,,23.18
+71,24.7,20.64,,20.0,,,14.12,20.77,29.44,31.87,22.11,26.33,26.67,23.26
+72,24.73,21.04,26.97,20.44,23.97,15.4,14.19,,,32.11,22.05,,26.67,23.76
+73,24.24,21.22,26.94,20.55,24.28,15.42,14.56,,29.76,32.73,,26.68,27.18,23.95
+74,24.13,21.26,27.44,20.99,24.78,15.73,14.46,21.29,30.01,32.0,22.55,26.88,27.72,
+75,24.59,21.69,28.04,21.1,24.76,15.97,,21.82,30.58,32.53,22.44,26.78,28.55,
+76,25.07,22.01,28.63,21.44,,15.92,14.81,22.35,,32.07,22.59,26.99,28.82,25.22
+77,25.43,22.4,28.38,22.05,25.39,16.37,15.24,22.47,31.47,33.44,23.18,27.68,,25.47
+78,26.49,22.21,28.61,,25.81,16.64,15.15,23.3,31.98,34.06,,28.25,29.14,25.96
+79,26.64,22.37,29.03,22.37,26.41,,15.49,,32.09,,23.69,28.39,29.21,26.37
+80,27.85,,30.06,22.68,26.49,,15.49,23.51,32.47,35.22,24.08,28.92,29.91,26.53
+81,,23.02,29.98,22.42,26.84,17.02,15.54,,32.68,35.66,23.97,28.85,30.04,26.78
+82,26.82,23.2,30.2,22.73,,16.87,15.76,24.26,32.4,,24.83,29.14,30.46,26.55
+83,27.56,23.05,30.07,22.93,26.29,16.84,16.39,23.86,33.1,35.92,24.48,29.14,30.81,27.05
+84,26.18,23.49,30.43,22.96,26.03,16.72,16.03,23.72,,35.66,,28.98,30.57,27.11
+85,25.74,23.68,30.23,23.08,26.44,17.05,16.34,,32.34,35.62,24.55,,31.13,26.79
+86,,23.79,30.21,23.31,26.45,17.18,16.2,24.1,,35.37,25.16,,,
+87,25.9,24.04,30.74,,26.68,17.37,16.33,24.09,33.07,35.61,,30.01,31.44,
+88,26.22,24.3,31.39,,26.44,,16.65,23.72,33.52,,,29.88,31.41,27.75
+89,26.67,24.51,31.63,23.73,26.9,17.66,16.4,23.78,32.88,36.63,25.7,29.4,32.0,27.92
+90,27.79,24.81,31.13,23.93,27.01,17.82,16.32,24.3,33.19,37.57,26.88,30.47,32.1,27.62
+91,27.27,24.88,30.89,24.06,26.83,,,24.46,,37.64,26.38,,32.43,28.06
+92,,25.29,34.01,,26.76,18.04,,25.1,34.64,38.36,26.57,31.42,32.48,28.29
+93,28.69,25.27,34.58,24.42,27.01,,16.95,24.92,33.61,38.41,27.76,30.34,32.57,28.59
+94,27.82,25.74,32.81,,27.67,18.35,16.97,25.06,34.78,39.48,29.21,31.7,32.66,28.11
+95,29.96,25.61,33.1,24.34,27.82,18.38,16.99,25.41,34.92,39.01,30.26,31.26,,28.29
+96,30.13,26.12,,24.6,27.78,18.43,17.24,25.99,35.74,39.24,27.26,31.68,32.79,
+97,29.34,25.97,33.54,25.06,28.02,,17.16,,,39.41,28.05,30.94,33.33,
+98,,,36.22,,27.94,18.7,17.2,26.76,36.64,39.63,28.32,32.19,33.89,
+99,30.16,26.29,35.35,25.54,28.17,18.97,17.41,26.5,37.01,39.86,28.69,31.51,34.46,28.94
+100,29.29,26.6,35.07,26.02,29.01,19.0,,26.57,38.91,40.75,28.84,31.03,34.1,29.29
+101,30.55,26.93,35.65,,28.63,19.11,17.55,26.85,,,29.5,31.03,33.78,28.96
+102,31.24,27.32,36.23,26.01,28.75,19.27,17.92,27.15,38.44,42.11,,32.31,34.14,29.2
+103,31.39,26.93,37.14,26.48,29.1,,,27.21,,42.77,29.81,,34.88,29.67
+104,32.25,,37.48,26.58,28.9,19.7,17.9,27.79,,42.07,29.68,32.53,34.79,29.7
+105,31.9,27.71,37.41,26.65,29.31,,18.24,27.84,38.41,42.51,28.83,32.39,,29.7
+106,32.49,28.03,37.7,26.98,29.86,19.95,,27.73,38.73,41.62,29.74,33.12,35.34,
+107,32.89,28.05,37.84,26.87,29.62,20.0,18.59,28.14,39.52,42.01,29.77,33.57,35.46,30.12
+108,32.71,,,26.93,29.51,20.23,18.86,27.85,38.76,42.47,29.52,33.54,,30.22
+109,,28.13,38.34,27.5,,20.43,,28.05,39.13,42.74,,33.74,35.88,29.91
+110,,28.51,38.46,27.5,30.27,20.46,19.18,28.24,39.31,42.3,30.04,33.72,36.5,30.1
+111,34.58,28.53,38.19,27.37,30.4,20.72,18.99,28.56,40.07,41.9,30.04,33.38,,29.83
+112,30.8,29.08,38.27,27.61,,21.01,18.83,27.71,39.64,42.27,29.86,33.43,37.07,
+113,33.77,28.28,37.89,28.01,31.15,21.12,19.41,27.77,40.75,42.42,30.34,33.64,37.05,30.6
+114,32.45,28.92,38.46,28.43,31.23,21.24,19.24,27.89,40.74,42.08,31.64,34.49,37.26,31.18
+115,33.29,29.02,39.63,28.47,31.73,21.37,19.41,28.49,40.49,42.38,32.5,34.64,37.84,31.45
+116,33.66,29.3,40.01,28.92,32.5,21.7,19.49,28.9,40.18,44.42,32.63,34.39,37.85,32.12
+117,33.51,29.11,38.95,28.98,31.77,22.02,20.0,28.51,39.74,45.06,33.2,34.16,37.85,32.0
+118,33.21,29.74,39.41,29.44,32.28,22.19,20.26,29.04,40.02,43.99,32.36,35.2,37.91,32.29
+119,32.9,29.77,39.77,29.69,32.73,22.34,20.4,29.43,40.05,45.06,32.33,36.12,38.32,32.7
+120,33.7,30.41,40.86,29.28,33.24,22.69,20.59,29.7,40.17,45.84,32.11,36.67,38.44,33.23
+121,32.93,30.39,41.71,29.66,33.3,22.82,21.5,30.12,40.64,46.8,32.63,36.62,39.22,33.59
+122,34.39,30.8,41.01,30.17,33.85,23.2,21.45,29.98,40.85,45.63,33.22,36.66,39.86,33.85
diff --git a/python/cuml/test/ts_datasets/population_estimate_missing.csv b/python/cuml/test/ts_datasets/population_estimate_missing.csv
new file mode 100644
index 0000000000..fe6f3ea724
--- /dev/null
+++ b/python/cuml/test/ts_datasets/population_estimate_missing.csv
@@ -0,0 +1,138 @@
+,Male,Female
+0,238.1,183.2
+1,,194.2
+2,252.5,201.6
+3,264.6,211.5
+4,281.8,225.5
+5,292.3,236.2
+6,299.4,
+7,307.7,
+8,,265.9
+9,331.0,277.4
+10,,282.8
+11,,290.9
+12,347.4,297.9
+13,347.8,301.6
+14,351.4,306.6
+15,,312.1
+16,359.0,317.0
+17,368.0,
+18,380.5,333.8
+19,386.6,341.5
+20,392.6,348.1
+21,398.7,355.4
+22,,362.5
+23,413.8,369.5
+24,420.4,376.0
+25,425.3,382.8
+26,437.3,
+27,449.0,402.0
+28,462.8,
+29,477.1,423.6
+30,490.5,435.1
+31,507.2,
+32,518.2,459.0
+33,535.9,472.5
+34,545.9,484.7
+35,555.5,
+36,,509.0
+37,,
+38,595.6,538.9
+39,,551.2
+40,,562.2
+41,575.8,574.5
+42,563.3,584.1
+43,,590.2
+44,627.8,599.4
+45,643.7,613.9
+46,,631.8
+47,673.8,
+48,686.0,657.1
+49,700.0,
+50,,
+51,730.6,699.0
+52,740.8,709.3
+53,749.1,717.8
+54,758.5,727.1
+55,,738.9
+56,775.6,747.1
+57,,
+58,,760.8
+59,792.0,766.4
+60,796.7,773.0
+61,804.3,780.3
+62,813.1,788.7
+63,821.7,
+64,,
+65,813.0,820.6
+66,799.2,832.0
+67,793.7,842.7
+68,790.8,851.2
+69,813.6,862.7
+70,855.9,872.6
+71,,891.1
+72,913.6,909.5
+73,934.3,927.6
+74,949.4,942.6
+75,967.3,960.3
+76,989.5,981.0
+77,1017.9,
+78,1043.1,1031.6
+79,,
+80,1089.1,
+81,,1098.0
+82,1137.8,1125.0
+83,1165.6,1150.3
+84,1186.1,1173.7
+85,1207.9,1195.6
+86,1238.0,1223.3
+87,1264.1,
+88,1288.4,
+89,,1304.0
+90,1336.7,1327.1
+91,1360.3,1351.0
+92,1373.6,1371.4
+93,,1387.6
+94,,1404.2
+95,1425.4,1426.7
+96,,1451.1
+97,1477.8,1481.9
+98,1510.0,1514.9
+99,1543.9,1548.0
+100,,1576.1
+101,1578.1,
+102,1578.4,1588.0
+103,1575.9,
+104,1573.8,1590.1
+105,1581.5,
+106,,
+107,1601.9,1624.9
+108,1620.7,1644.1
+109,1632.2,1660.8
+110,,1666.3
+111,,1674.3
+112,1652.9,1689.2
+113,1649.7,
+114,1659.7,1710.1
+115,1681.9,1728.5
+116,,
+117,1749.1,1803.1
+118,1772.5,1825.4
+119,1797.8,1850.4
+120,1828.0,1878.7
+121,1855.4,1906.9
+122,1872.9,1929.7
+123,1883.3,1945.9
+124,1891.7,1959.5
+125,1900.4,1972.6
+126,1920.5,1995.7
+127,1956.7,2032.9
+128,1991.8,2069.8
+129,2016.2,2098.1
+130,2037.7,2123.3
+131,2061.8,2149.6
+132,2083.4,2169.2
+133,2104.1,2187.4
+134,2134.0,2213.2
+135,2158.2,2234.9
+136,2174.3,2248.4
diff --git a/python/cuml/tsa/arima.pyx b/python/cuml/tsa/arima.pyx
index acb2ae927d..a92be021b3 100644
--- a/python/cuml/tsa/arima.pyx
+++ b/python/cuml/tsa/arima.pyx
@@ -67,6 +67,9 @@ cdef extern from "cuml/tsa/batched_arima.hpp" namespace "ML":
         handle_t& handle, ARIMAParams[double]& params,
         const ARIMAOrder& order, int batch_size, const double* param_vec)
 
+    bool detect_missing(
+        handle_t& handle, const double* d_y, int n_elem)
+
     void batched_diff(
         handle_t& handle, double* d_y_diff, const double* d_y, int batch_size,
         int n_obs, const ARIMAOrder& order)
@@ -74,14 +77,14 @@ cdef extern from "cuml/tsa/batched_arima.hpp" namespace "ML":
     void batched_loglike(
         handle_t& handle, const ARIMAMemory[double]& arima_mem,
         const double* y, int batch_size, int nobs, const ARIMAOrder& order,
-        const double* params, double* loglike, double* d_vs, bool trans,
-        bool host_loglike, LoglikeMethod method, int truncate)
+        const double* params, double* loglike, bool trans, bool host_loglike,
+        LoglikeMethod method, int truncate)
 
     void batched_loglike(
         handle_t& handle, const ARIMAMemory[double]& arima_mem,
         const double* y, int batch_size, int n_obs, const ARIMAOrder& order,
-        const ARIMAParams[double]& params, double* loglike, double* d_vs,
-        bool trans, bool host_loglike, LoglikeMethod method, int truncate)
+        const ARIMAParams[double]& params, double* loglike, bool trans,
+        bool host_loglike, LoglikeMethod method, int truncate)
 
     void batched_loglike_grad(
         handle_t& handle, const ARIMAMemory[double]& arima_mem,
@@ -103,7 +106,7 @@ cdef extern from "cuml/tsa/batched_arima.hpp" namespace "ML":
 
     void estimate_x0(
         handle_t& handle, ARIMAParams[double]& params, const double* d_y,
-        int batch_size, int nobs, const ARIMAOrder& order)
+        int batch_size, int nobs, const ARIMAOrder& order, bool missing)
 
 
 cdef extern from "cuml/tsa/batched_kalman.hpp" namespace "ML":
@@ -150,7 +153,8 @@ class ARIMA(Base):
     See https://en.wikipedia.org/wiki/Autoregressive_integrated_moving_average
 
     This class can fit an ARIMA(p,d,q) or ARIMA(p,d,q)(P,D,Q)_s model to a
-    batch of time series of the same length with no missing values.
+    batch of time series of the same length (or various lengths, using missing
+    values at the start for padding).
     The implementation is designed to give the best performance when using
     large batches of time series.
 
@@ -160,6 +164,7 @@ class ARIMA(Base):
         The time series data, assumed to have each time series in columns.
         Acceptable formats: cuDF DataFrame, cuDF Series, NumPy ndarray,
         Numba device ndarray, cuda array interface compliant array like CuPy.
+        Missing values are accepted, represented by NaN.
     order : Tuple[int, int, int]
         The ARIMA order (p, d, q) of the model
     seasonal_order: Tuple[int, int, int, int]
@@ -364,18 +369,30 @@ class ARIMA(Base):
         the CumlArrayDescriptors work
         """
 
-        # Compute the differenced series
         cdef uintptr_t d_y_ptr = self.d_y.ptr
         cdef uintptr_t d_y_diff_ptr = self._d_y_diff.ptr
         cdef handle_t* handle_ = <handle_t*><size_t>self.handle.getHandle()
-        batched_diff(handle_[0], <double*> d_y_diff_ptr, <double*> d_y_ptr,
-                     <int> self.batch_size, <int> self.n_obs, self.order)
-
-        # Create a version of the order for the differenced series
         cdef ARIMAOrder cpp_order_diff = self.order
-        cpp_order_diff.d = 0
-        cpp_order_diff.D = 0
-        self.order_diff = cpp_order_diff
+
+        # Detect missing observations
+        self.missing = detect_missing(handle_[0], <double*> d_y_ptr,
+                                      <int> self.batch_size * self.n_obs)
+        if self.missing and self.simple_differencing:
+            logger.warn("Missing observations detected."
+                        " Forcing simple_differencing=False")
+            self.simple_differencing = False
+
+        if self.simple_differencing:
+            # Compute the differenced series
+            batched_diff(handle_[0], <double*> d_y_diff_ptr, <double*> d_y_ptr,
+                         <int> self.batch_size, <int> self.n_obs, self.order)
+
+            # Create a version of the order for the differenced series
+            cpp_order_diff.d = 0
+            cpp_order_diff.D = 0
+            self.order_diff = cpp_order_diff
+        else:
+            self.order_diff = None
 
     def __str__(self):
         cdef ARIMAOrder order = self.order
@@ -574,9 +591,9 @@ class ARIMA(Base):
                              " the data and the prediction")
         elif end <= start:
             raise ValueError("ERROR(`predict`): end <= start")
-        elif start < order.d + order.D * order.s:
-            logger.warn("WARNING(`predict`): predictions before {} are"
-                        " undefined, will be set to NaN"
+        elif self.simple_differencing and start < order.d + order.D * order.s:
+            logger.warn("Predictions before {} are undefined when using"
+                        " simple_differencing=True, will be set to NaN"
                         .format(order.d + order.D * order.s))
 
         if level is not None:
@@ -710,7 +727,8 @@ class ARIMA(Base):
 
         # Call C++ function
         estimate_x0(handle_[0], cpp_params, <double*> d_y_ptr,
-                    <int> self.batch_size, <int> self.n_obs, order)
+                    <int> self.batch_size, <int> self.n_obs, order,
+                    <bool> self.missing)
 
     @cuml.internals.api_base_return_any_skipall
     def fit(self,
@@ -798,6 +816,10 @@ class ARIMA(Base):
         method = method.lower()
         if method not in {"css", "css-ml", "ml"}:
             raise ValueError("Unknown method: {}".format(method))
+        if self.missing and (method == "css" or method == "css-ml"):
+            logger.warn("Missing observations detected."
+                        " Forcing method=\"ml\"")
+            method = "ml"
         if method == "css" or method == "css-ml":
             x, self.niter = fit_helper(x0, "css")
         if method == "css-ml" or method == "ml":
@@ -849,11 +871,7 @@ class ARIMA(Base):
 
         cdef handle_t* handle_ = <handle_t*><size_t>self.handle.getHandle()
 
-        # TODO: don't create vs array every time!
         n_obs_kf = (self.n_obs_diff if diff else self.n_obs)
-        d_vs = CumlArray.empty((n_obs_kf, self.batch_size), dtype=np.float64,
-                               order="F")
-        cdef uintptr_t d_vs_ptr = d_vs.ptr
 
         cdef uintptr_t d_temp_mem = self._temp_mem.ptr
         arima_mem_ptr = new ARIMAMemory[double](
@@ -863,8 +881,7 @@ class ARIMA(Base):
         batched_loglike(handle_[0], arima_mem_ptr[0], <double*> d_y_kf_ptr,
                         <int> self.batch_size, <int> n_obs_kf, order_kf,
                         <double*> d_x_ptr, <double*> vec_loglike.data(),
-                        <double*> d_vs_ptr, <bool> trans, <bool> True,
-                        ll_method, <int> truncate)
+                        <bool> trans, <bool> True, ll_method, <int> truncate)
 
         del arima_mem_ptr
 
@@ -964,10 +981,6 @@ class ARIMA(Base):
         cdef LoglikeMethod ll_method = MLE
         diff = self.simple_differencing
 
-        d_vs = CumlArray.empty((n_obs_kf, self.batch_size), dtype=np.float64,
-                               order="F")
-        cdef uintptr_t d_vs_ptr = d_vs.ptr
-
         cdef uintptr_t d_temp_mem = self._temp_mem.ptr
         arima_mem_ptr = new ARIMAMemory[double](
             order, <int> self.batch_size, <int> self.n_obs,
@@ -976,8 +989,7 @@ class ARIMA(Base):
         batched_loglike(handle_[0], arima_mem_ptr[0], <double*> d_y_kf_ptr,
                         <int> self.batch_size, <int> n_obs_kf, order_kf,
                         cpp_params, <double*> vec_loglike.data(),
-                        <double*> d_vs_ptr, <bool> False, <bool> True,
-                        ll_method, <int> 0)
+                        <bool> False, <bool> True, ll_method, <int> 0)
 
         del arima_mem_ptr
 
diff --git a/python/cuml/tsa/auto_arima.pyx b/python/cuml/tsa/auto_arima.pyx
index 52bc1bd57d..8c007cd4f9 100644
--- a/python/cuml/tsa/auto_arima.pyx
+++ b/python/cuml/tsa/auto_arima.pyx
@@ -99,6 +99,10 @@ cdef extern from "cuml/tsa/auto_arima.h" namespace "ML":
         const int* d_id_to_pos, const int* d_id_to_sub, double* d_out,
         int batch_size, int n_sub, int n_obs)
 
+cdef extern from "cuml/tsa/batched_arima.hpp" namespace "ML":
+    bool detect_missing(
+        handle_t& handle, const double* d_y, int n_elem)
+
 tests_map = {
     "kpss": kpss_test,
     "seas": seas_test,
@@ -195,6 +199,21 @@ class AutoARIMA(Base):
 
         self.simple_differencing = simple_differencing
 
+        self._initial_calc()
+
+    @cuml.internals.api_base_return_any_skipall
+    def _initial_calc(self):
+        cdef uintptr_t d_y_ptr = self.d_y.ptr
+        cdef handle_t* handle_ = <handle_t*><size_t>self.handle.getHandle()
+
+        # Detect missing observations
+        missing = detect_missing(handle_[0], <double*> d_y_ptr,
+                                 <int> self.batch_size * self.n_obs)
+
+        if missing:
+            raise ValueError(
+                "Missing observations are not supported in AutoARIMA yet")
+
     @cuml.internals.api_return_any()
     def search(self,
                s=None,
diff --git a/python/setup.py b/python/setup.py
index 7c4f88782b..06f4722fe3 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -210,11 +210,20 @@ class cuml_build_ext(cython_build_ext, object):
     boolean_options = ["singlegpu"] + cython_build_ext.boolean_options
 
     def build_extensions(self):
-        try:
-            # Silence the '-Wstrict-prototypes' warning
-            self.compiler.compiler_so.remove("-Wstrict-prototypes")
-        except Exception:
-            pass
+        def remove_flags(compiler, *flags):
+            for flag in flags:
+                try:
+                    compiler.compiler_so = list(
+                        filter((flag).__ne__, compiler.compiler_so)
+                    )
+                except Exception:
+                    pass
+        # Full optimization
+        self.compiler.compiler_so.append("-O3")
+        # No debug symbols, full optimization, no '-Wstrict-prototypes' warning
+        remove_flags(
+            self.compiler, "-g", "-G", "-O1", "-O2", "-Wstrict-prototypes"
+        )
         cython_build_ext.build_extensions(self)
 
     def initialize_options(self):

From 688b89e82da3699e05adff2d741bd6733db47895 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Thu, 4 Nov 2021 15:29:07 -0500
Subject: [PATCH 18/18] FIX Remove tsne warning

---
 python/cuml/manifold/t_sne.pyx | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx
index 1a1a19c635..516ea842f6 100644
--- a/python/cuml/manifold/t_sne.pyx
+++ b/python/cuml/manifold/t_sne.pyx
@@ -280,11 +280,6 @@ class TSNE(Base,
         if n_components < 0:
             raise ValueError("n_components = {} should be more "
                              "than 0.".format(n_components))
-        # Enable warning once n_components >= 2 is supported.
-        # if n_components != 2 and (method == 'barnes_hut' or method == 'fft'):
-        #     warnings.warn("Barnes Hut and FFT only work when "
-        #                   "n_components == 2. Switching to exact.")
-        #     method = 'exact'
         if n_components != 2:
             raise ValueError("Currently TSNE supports n_components = 2; "
                              "but got n_components = {}".format(n_components))