diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2b1fe8aa0..9b2ae0ce0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -379,18 +379,16 @@ add_library( src/neighbors/detail/refine_host_int8_t_float.cpp src/neighbors/detail/refine_host_uint8_t_float.cpp src/neighbors/ivf_flat_index.cpp - src/neighbors/ivf_flat/ivf_flat_build_float_int64_t.cpp - src/neighbors/ivf_flat/ivf_flat_build_int8_t_int64_t.cpp - src/neighbors/ivf_flat/ivf_flat_build_uint8_t_int64_t.cpp - src/neighbors/ivf_flat/ivf_flat_extend_float_int64_t.cpp - src/neighbors/ivf_flat/ivf_flat_extend_int8_t_int64_t.cpp - src/neighbors/ivf_flat/ivf_flat_extend_uint8_t_int64_t.cpp - src/neighbors/ivf_flat/ivf_flat_search_float_int64_t.cpp - src/neighbors/ivf_flat/ivf_flat_search_int8_t_int64_t.cpp - src/neighbors/ivf_flat/ivf_flat_search_uint8_t_int64_t.cpp - src/neighbors/ivf_flat/ivf_flat_serialize_float_int64_t.cpp - src/neighbors/ivf_flat/ivf_flat_serialize_int8_t_int64_t.cpp - src/neighbors/ivf_flat/ivf_flat_serialize_uint8_t_int64_t.cpp + src/neighbors/ivf_flat/ivf_flat_build_extend_float_int64_t.cu + src/neighbors/ivf_flat/ivf_flat_build_extend_int8_t_int64_t.cu + src/neighbors/ivf_flat/ivf_flat_build_extend_uint8_t_int64_t.cu + src/neighbors/ivf_flat/ivf_flat_helpers.cu + src/neighbors/ivf_flat/ivf_flat_search_float_int64_t.cu + src/neighbors/ivf_flat/ivf_flat_search_int8_t_int64_t.cu + src/neighbors/ivf_flat/ivf_flat_search_uint8_t_int64_t.cu + src/neighbors/ivf_flat/ivf_flat_serialize_float_int64_t.cu + src/neighbors/ivf_flat/ivf_flat_serialize_int8_t_int64_t.cu + src/neighbors/ivf_flat/ivf_flat_serialize_uint8_t_int64_t.cu src/neighbors/ivf_pq_index.cpp src/neighbors/ivf_pq/ivf_pq_build_common.cu src/neighbors/ivf_pq/ivf_pq_serialize.cu diff --git a/cpp/include/cuvs/neighbors/ivf_flat.hpp b/cpp/include/cuvs/neighbors/ivf_flat.hpp index efb32e024..33daedc78 100644 --- a/cpp/include/cuvs/neighbors/ivf_flat.hpp +++ b/cpp/include/cuvs/neighbors/ivf_flat.hpp @@ -17,13 +17,19 @@ #pragma once #include "ann_types.hpp" -#include +#include "ivf_list.hpp" +#include +#include namespace cuvs::neighbors::ivf_flat { /** * @defgroup ivf_flat_cpp_index_params IVF-Flat index build parameters * @{ */ + +/** Size of the interleaved group (see `index::data` description). */ +constexpr static uint32_t kIndexGroupSize = 32; + struct index_params : ann::index_params { /** The number of inverted lists (clusters) */ uint32_t n_lists = 1024; @@ -54,22 +60,6 @@ struct index_params : ann::index_params { * flag to `true` if you prefer to use as little GPU memory for the database as possible. */ bool conservative_memory_allocation = false; - - /** Build a raft IVF_FLAT index params from an existing cuvs IVF_FLAT index params. */ - operator raft::neighbors::ivf_flat::index_params() const - { - return raft::neighbors::ivf_flat::index_params{ - { - .metric = static_cast((int)this->metric), - .metric_arg = this->metric_arg, - .add_data_on_build = this->add_data_on_build, - }, - .n_lists = n_lists, - .kmeans_n_iters = kmeans_n_iters, - .kmeans_trainset_fraction = kmeans_trainset_fraction, - .adaptive_centers = adaptive_centers, - .conservative_memory_allocation = conservative_memory_allocation}; - } }; /** * @} @@ -82,14 +72,45 @@ struct index_params : ann::index_params { struct search_params : ann::search_params { /** The number of clusters to search. 
*/ uint32_t n_probes = 20; +}; + +static_assert(std::is_aggregate_v); +static_assert(std::is_aggregate_v); + +template +struct list_spec { + using value_type = ValueT; + using list_extents = raft::matrix_extent; + using index_type = IdxT; + + SizeT align_max; + SizeT align_min; + uint32_t dim; + + constexpr list_spec(uint32_t dim, bool conservative_memory_allocation) + : dim(dim), + align_min(kIndexGroupSize), + align_max(conservative_memory_allocation ? kIndexGroupSize : 1024) + { + } + + // Allow casting between different size-types (for safer size and offset calculations) + template + constexpr explicit list_spec(const list_spec& other_spec) + : dim{other_spec.dim}, align_min{other_spec.align_min}, align_max{other_spec.align_max} + { + } - /** Build a raft IVF_FLAT search params from an existing cuvs IVF_FLAT search params. */ - operator raft::neighbors::ivf_flat::search_params() const + /** Determine the extents of an array enough to hold a given amount of data. */ + constexpr auto make_list_extents(SizeT n_rows) const -> list_extents { - raft::neighbors::ivf_flat::search_params result = {{}, n_probes}; - return result; + return raft::make_extents(n_rows, dim); } }; + +template +using list_data = ivf::list; + /** * @} */ @@ -125,7 +146,6 @@ struct index : ann::index { bool adaptive_centers, bool conservative_memory_allocation, uint32_t dim); - index(raft::neighbors::ivf_flat::index&& raft_idx); /** * Vectorized load/store size in elements, determines the size of interleaved data chunks. @@ -181,6 +201,19 @@ struct index : ann::index { std::optional> center_norms() noexcept; std::optional> center_norms() const noexcept; + /** + * Accumulated list sizes, sorted in descending order [n_lists + 1]. + * The last value contains the total length of the index. + * The value at index zero is always zero. + * + * That is, the content of this span is as if the `list_sizes` was sorted and then accumulated. + * + * This span is used during search to estimate the maximum size of the workspace. + */ + auto accum_sorted_sizes() noexcept -> raft::host_vector_view; + [[nodiscard]] auto accum_sorted_sizes() const noexcept + -> raft::host_vector_view; + /** Total length of the index. */ IdxT size() const noexcept; @@ -202,23 +235,44 @@ struct index : ann::index { */ bool conservative_memory_allocation() const noexcept; + void allocate_center_norms(raft::resources const& res); + /** Lists' data and indices. 
*/ - std::vector>>& lists() noexcept; - const std::vector>>& lists() - const noexcept; + std::vector>>& lists() noexcept; + const std::vector>>& lists() const noexcept; - // Get pointer to underlying RAFT index, not meant to be used outside of cuVS - inline raft::neighbors::ivf_flat::index* get_raft_index() noexcept - { - return raft_index_.get(); - } - inline const raft::neighbors::ivf_flat::index* get_raft_index() const noexcept - { - return raft_index_.get(); - } + void check_consistency(); private: - std::unique_ptr> raft_index_; + /** + * TODO: in theory, we can lift this to the template parameter and keep it at hardware maximum + * possible value by padding the `dim` of the data https://github.com/rapidsai/raft/issues/711 + */ + uint32_t veclen_; + cuvs::distance::DistanceType metric_; + bool adaptive_centers_; + bool conservative_memory_allocation_; + std::vector>> lists_; + raft::device_vector list_sizes_; + raft::device_matrix centers_; + std::optional> center_norms_; + + // Computed members + raft::device_vector data_ptrs_; + raft::device_vector inds_ptrs_; + raft::host_vector accum_sorted_sizes_; + + static auto calculate_veclen(uint32_t dim) -> uint32_t + { + // TODO: consider padding the dimensions and fixing veclen to its maximum possible value as a + // template parameter (https://github.com/rapidsai/raft/issues/711) + + // NOTE: keep this consistent with the select_interleaved_scan_kernel logic + // in detail/ivf_flat_interleaved_scan-inl.cuh. + uint32_t veclen = std::max(1, 16 / sizeof(T)); + if (dim % veclen != 0) { veclen = 1; } + return veclen; + } }; /** * @} @@ -368,6 +422,147 @@ void build(raft::resources const& handle, const cuvs::neighbors::ivf_flat::index_params& index_params, raft::device_matrix_view dataset, cuvs::neighbors::ivf_flat::index& idx); + +/** + * @brief Build the index from the dataset for efficient search. + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_flat::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = ivf_flat::build(handle, dataset, index_params); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] dataset raft::host_matrix_view to a row-major matrix [n_rows, dim] + * + * @return the constructed ivf-flat index + */ +auto build(raft::resources const& handle, + const cuvs::neighbors::ivf_flat::index_params& index_params, + raft::host_matrix_view dataset) + -> cuvs::neighbors::ivf_flat::index; + +/** + * @brief Build the index from the dataset for efficient search. + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_flat::index_params index_params; + * // create and fill the index from a [N, D] dataset + * ivf_flat::index index; + * ivf_flat::build(handle, dataset, index_params, index); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] dataset raft::host_matrix_view to a row-major matrix [n_rows, dim] + * @param[out] idx reference to ivf_flat::index + * + */ +void build(raft::resources const& handle, + const cuvs::neighbors::ivf_flat::index_params& index_params, + raft::host_matrix_view dataset, + cuvs::neighbors::ivf_flat::index& idx); + +/** + * @brief Build the index from the dataset for efficient search. 
+ * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_flat::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = ivf_flat::build(handle, dataset, index_params); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] dataset a host pointer to a row-major matrix [n_rows, dim] + * + * @return the constructed ivf-flat index + */ +auto build(raft::resources const& handle, + const cuvs::neighbors::ivf_flat::index_params& index_params, + raft::host_matrix_view dataset) + -> cuvs::neighbors::ivf_flat::index; + +/** + * @brief Build the index from the dataset for efficient search. + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_flat::index_params index_params; + * // create and fill the index from a [N, D] dataset + * ivf_flat::index index; + * ivf_flat::build(handle, dataset, index_params, index); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] dataset raft::host_matrix_view to a row-major matrix [n_rows, dim] + * @param[out] idx reference to ivf_flat::index + * + */ +void build(raft::resources const& handle, + const cuvs::neighbors::ivf_flat::index_params& index_params, + raft::host_matrix_view dataset, + cuvs::neighbors::ivf_flat::index& idx); + +/** + * @brief Build the index from the dataset for efficient search. + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_flat::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = ivf_flat::build(handle, dataset, index_params); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] dataset a host pointer to a row-major matrix [n_rows, dim] + * + * @return the constructed ivf-flat index + */ +auto build(raft::resources const& handle, + const cuvs::neighbors::ivf_flat::index_params& index_params, + raft::host_matrix_view dataset) + -> cuvs::neighbors::ivf_flat::index; + +/** + * @brief Build the index from the dataset for efficient search. + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_flat::index_params index_params; + * // create and fill the index from a [N, D] dataset + * ivf_flat::index index; + * ivf_flat::build(handle, dataset, index_params, index); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] dataset raft::host_matrix_view to a row-major matrix [n_rows, dim] + * @param[out] idx reference to ivf_flat::index + * + */ +void build(raft::resources const& handle, + const cuvs::neighbors::ivf_flat::index_params& index_params, + raft::host_matrix_view dataset, + cuvs::neighbors::ivf_flat::index& idx); /** * @} */ @@ -568,6 +763,198 @@ void extend(raft::resources const& handle, raft::device_matrix_view new_vectors, std::optional> new_indices, cuvs::neighbors::ivf_flat::index* idx); + +/** + * @brief Build a new index containing the data of the original plus new extra vectors. + * + * Implementation note: + * The new data is clustered according to existing kmeans clusters, then the cluster + * centers are adjusted to match the newly labeled data. 
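+ * (Per `ivf_flat_build.cuh` in this change, the centers are only adjusted when the index was
+ * built with `adaptive_centers = true`; otherwise only the list sizes are updated.)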
+ * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * ivf_flat::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_flat::build(handle, index_params, dataset); + * // fill the index with the data + * std::optional> no_op = std::nullopt; + * auto index = ivf_flat::extend(handle, new_vectors, no_op, index_empty); + * @endcode + * + * @param[in] handle + * @param[in] new_vectors raft::host_matrix_view to a row-major matrix [n_rows, index.dim()] + * @param[in] new_indices optional raft::host_vector_view to a vector of indices [n_rows]. + * If the original index is empty (`orig_index.size() == 0`), you can pass `std::nullopt` + * here to imply a continuous range `[0...n_rows)`. + * @param[in] idx original index + * + * @return the constructed extended ivf-flat index + */ +auto extend(raft::resources const& handle, + raft::host_matrix_view new_vectors, + std::optional> new_indices, + const cuvs::neighbors::ivf_flat::index& idx) + -> cuvs::neighbors::ivf_flat::index; + +/** + * @brief Extend the index in-place with the new data. + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * ivf_flat::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_flat::build(handle, index_params, dataset); + * // fill the index with the data + * std::optional> no_op = std::nullopt; + * ivf_flat::extend(handle, dataset, no_opt, &index_empty); + * @endcode + * + * + * @param[in] handle + * @param[in] new_vectors raft::host_matrix_view to a row-major matrix [n_rows, index.dim()] + * @param[in] new_indices optional raft::host_vector_view to a vector of indices [n_rows]. + * If the original index is empty (`orig_index.size() == 0`), you can pass `std::nullopt` + * here to imply a continuous range `[0...n_rows)`. + * @param[inout] idx pointer to index, to be overwritten in-place + */ +void extend(raft::resources const& handle, + raft::host_matrix_view new_vectors, + std::optional> new_indices, + cuvs::neighbors::ivf_flat::index* idx); + +/** + * @brief Build a new index containing the data of the original plus new extra vectors. + * + * Implementation note: + * The new data is clustered according to existing kmeans clusters, then the cluster + * centers are adjusted to match the newly labeled data. + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * ivf_flat::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_flat::build(handle, dataset, index_params, dataset); + * // fill the index with the data + * std::optional> no_op = std::nullopt; + * auto index = ivf_flat::extend(handle, new_vectors, no_op, index_empty); + * @endcode + * + * @param[in] handle + * @param[in] new_vectors raft::host_matrix_view to a row-major matrix [n_rows, index.dim()] + * @param[in] new_indices optional raft::host_vector_view to a vector of indices [n_rows]. 
+ * If the original index is empty (`orig_index.size() == 0`), you can pass `std::nullopt` + * here to imply a continuous range `[0...n_rows)`. + * @param[in] idx original index + * + * @return the constructed extended ivf-flat index + */ +auto extend(raft::resources const& handle, + raft::host_matrix_view new_vectors, + std::optional> new_indices, + const cuvs::neighbors::ivf_flat::index& idx) + -> cuvs::neighbors::ivf_flat::index; + +/** + * @brief Extend the index in-place with the new data. + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * ivf_flat::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_flat::build(handle, index_params, dataset); + * // fill the index with the data + * std::optional> no_op = std::nullopt; + * ivf_flat::extend(handle, dataset, no_opt, &index_empty); + * @endcode + * + * + * @param[in] handle + * @param[in] new_vectors raft::host_matrix_view to a row-major matrix [n_rows, index.dim()] + * @param[in] new_indices optional raft::host_vector_view to a vector of indices [n_rows]. + * If the original index is empty (`orig_index.size() == 0`), you can pass `std::nullopt` + * here to imply a continuous range `[0...n_rows)`. + * @param[inout] idx pointer to index, to be overwritten in-place + */ +void extend(raft::resources const& handle, + raft::host_matrix_view new_vectors, + std::optional> new_indices, + cuvs::neighbors::ivf_flat::index* idx); + +/** + * @brief Build a new index containing the data of the original plus new extra vectors. + * + * Implementation note: + * The new data is clustered according to existing kmeans clusters, then the cluster + * centers are adjusted to match the newly labeled data. + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * ivf_flat::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_flat::build(handle, dataset, index_params, dataset); + * // fill the index with the data + * std::optional> no_op = std::nullopt; + * auto index = ivf_flat::extend(handle, new_vectors, no_op, index_empty); + * @endcode + * + * @param[in] handle + * @param[in] new_vectors raft::host_matrix_view to a row-major matrix [n_rows, index.dim()] + * @param[in] new_indices optional raft::host_vector_view to a vector of indices [n_rows]. + * If the original index is empty (`orig_index.size() == 0`), you can pass `std::nullopt` + * here to imply a continuous range `[0...n_rows)`. + * @param[in] idx original index + * + * @return the constructed extended ivf-flat index + */ +auto extend(raft::resources const& handle, + raft::host_matrix_view new_vectors, + std::optional> new_indices, + const cuvs::neighbors::ivf_flat::index& idx) + -> cuvs::neighbors::ivf_flat::index; + +/** + * @brief Extend the index in-place with the new data. 
+ * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * ivf_flat::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_flat::build(handle, index_params, dataset); + * // fill the index with the data + * std::optional> no_op = std::nullopt; + * ivf_flat::extend(handle, dataset, no_opt, &index_empty); + * @endcode + * + * + * @param[in] handle + * @param[in] new_vectors raft::host_matrix_view to a row-major matrix [n_rows, index.dim()] + * @param[in] new_indices optional raft::host_vector_view to a vector of indices [n_rows]. + * If the original index is empty (`orig_index.size() == 0`), you can pass `std::nullopt` + * here to imply a continuous range `[0...n_rows)`. + * @param[inout] idx pointer to index, to be overwritten in-place + */ +void extend(raft::resources const& handle, + raft::host_matrix_view new_vectors, + std::optional> new_indices, + cuvs::neighbors::ivf_flat::index* idx); /** * @} */ @@ -687,6 +1074,105 @@ void search(raft::resources const& handle, raft::device_matrix_view queries, raft::device_matrix_view neighbors, raft::device_matrix_view distances); + +/** + * @brief Search ANN using the constructed index with the given filter. + * + * See the [ivf_flat::build](#ivf_flat::build) documentation for a usage example. + * + * Note, this function requires a temporary buffer to store intermediate results between cuda kernel + * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can + * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or + * eliminate entirely allocations happening within `search`. + * The exact size of the temporary buffer depends on multiple factors and is an implementation + * detail. However, you can safely specify a small initial size for the memory pool, so that only a + * few allocations happen to grow it during the first invocations of the `search`. + * + * @param[in] handle + * @param[in] params configure the search + * @param[in] idx ivf-flat constructed index + * @param[in] queries a device matrix view to a row-major matrix [n_queries, index->dim()] + * @param[out] neighbors a device matrix view to the indices of the neighbors in the source dataset + * [n_queries, k] + * @param[out] distances a device matrix view to the distances to the selected neighbors [n_queries, + * k] + * @param[in] sample_filter a device bitset filter function that greenlights samples for a given + * query. + */ +void search_with_filtering( + raft::resources const& handle, + const search_params& params, + index& idx, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances, + cuvs::neighbors::filtering::bitset_filter sample_filter); + +/** + * @brief Search ANN using the constructed index with the given filter. + * + * See the [ivf_flat::build](#ivf_flat::build) documentation for a usage example. + * + * Note, this function requires a temporary buffer to store intermediate results between cuda kernel + * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can + * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or + * eliminate entirely allocations happening within `search`. 
+ * The exact size of the temporary buffer depends on multiple factors and is an implementation + * detail. However, you can safely specify a small initial size for the memory pool, so that only a + * few allocations happen to grow it during the first invocations of the `search`. + * + * @param[in] handle + * @param[in] params configure the search + * @param[in] idx ivf-flat constructed index + * @param[in] queries a device matrix view to a row-major matrix [n_queries, index->dim()] + * @param[out] neighbors a device matrix view to the indices of the neighbors in the source dataset + * [n_queries, k] + * @param[out] distances a device matrix view to the distances to the selected neighbors [n_queries, + * k] + * @param[in] sample_filter a device bitset filter function that greenlights samples for a given + * query. + */ +void search_with_filtering( + raft::resources const& handle, + const search_params& params, + index& idx, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances, + cuvs::neighbors::filtering::bitset_filter sample_filter); + +/** + * @brief Search ANN using the constructed index with the given filter. + * + * See the [ivf_flat::build](#ivf_flat::build) documentation for a usage example. + * + * Note, this function requires a temporary buffer to store intermediate results between cuda kernel + * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can + * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or + * eliminate entirely allocations happening within `search`. + * The exact size of the temporary buffer depends on multiple factors and is an implementation + * detail. However, you can safely specify a small initial size for the memory pool, so that only a + * few allocations happen to grow it during the first invocations of the `search`. + * + * @param[in] handle + * @param[in] params configure the search + * @param[in] idx ivf-flat constructed index + * @param[in] queries a device matrix view to a row-major matrix [n_queries, index->dim()] + * @param[out] neighbors a device matrix view to the indices of the neighbors in the source dataset + * [n_queries, k] + * @param[out] distances a device matrix view to the distances to the selected neighbors [n_queries, + * k] + * @param[in] sample_filter a device bitset filter function that greenlights samples for a given + * query. 
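+ *
+ * Usage sketch, assuming `index`, `queries`, `neighbors` and `distances` are prepared as in the
+ * plain `search` examples, and `sample_filter` is a `filtering::bitset_filter<uint32_t, int64_t>`
+ * whose bitset has one bit per dataset row marking the rows that may be returned:
+ * @code{.cpp}
+ *   ivf_flat::search_params params;
+ *   params.n_probes = 32;
+ *   ivf_flat::search_with_filtering(
+ *     handle, params, index, queries, neighbors, distances, sample_filter);
+ * @endcode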
+ */ +void search_with_filtering( + raft::resources const& handle, + const search_params& params, + index& idx, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances, + cuvs::neighbors::filtering::bitset_filter sample_filter); /** * @} */ @@ -737,7 +1223,7 @@ void serialize_file(raft::resources const& handle, * std::string filename("/path/to/index"); * using T = float; // data element type * using IdxT = int64_t; // type of the index - * // create an empty index with `ivf_pq::index index(handle, index_params, dim);` + * // create an empty index with `ivf_flat::index index(handle, index_params, dim);` * cuvs::deserialize_file(handle, filename, &index); * @endcode * @@ -791,7 +1277,7 @@ void serialize(raft::resources const& handle, * std::string str; * using T = float; // data element type * using IdxT = int64_t; // type of the index - * // create an empty index with `ivf_pq::index index(handle, index_params, dim);` + * // create an empty index with `ivf_flat::index index(handle, index_params, dim);` * auto index = cuvs::deserialize(handle, str, &index); * @endcode * @@ -845,7 +1331,7 @@ void serialize_file(raft::resources const& handle, * std::string filename("/path/to/index"); * using T = float; // data element type * using IdxT = int64_t; // type of the index - * // create an empty index with `ivf_pq::index index(handle, index_params, dim);` + * // create an empty index with `ivf_flat::index index(handle, index_params, dim);` * cuvs::deserialize_file(handle, filename, &index); * @endcode * @@ -899,7 +1385,7 @@ void serialize(raft::resources const& handle, * std::string str; * using T = float; // data element type * using IdxT = int64_t; // type of the index - * // create an empty index with `ivf_pq::index index(handle, index_params, dim);` + * // create an empty index with `ivf_flat::index index(handle, index_params, dim);` * auto index = cuvs::deserialize(handle, str, &index); * @endcode * @@ -953,7 +1439,7 @@ void serialize_file(raft::resources const& handle, * std::string filename("/path/to/index"); * using T = float; // data element type * using IdxT = int64_t; // type of the index - * // create an empty index with `ivf_pq::index index(handle, index_params, dim);` + * // create an empty index with ivf_flat::index index(handle, index_params, dim);` * cuvs::deserialize_file(handle, filename, &index); * @endcode * @@ -1007,7 +1493,7 @@ void serialize(raft::resources const& handle, * std::string str; * using T = float; // data element type * using IdxT = int64_t; // type of the index - * // create an empty index with `ivf_pq::index index(handle, index_params, dim);` + * // create an empty index with `ivf_flat::index index(handle, index_params, dim);` * auto index = cuvs::deserialize(handle, str, &index); * @endcode * @@ -1023,4 +1509,360 @@ void deserialize(raft::resources const& handle, /** * @} */ + +namespace helpers { + +/** + * @defgroup ivf_flat_helpers Helper functions for IVF Flat + * @{ + */ + +namespace codepacker { + +/** + * Write flat codes into an existing list by the given offset. + * + * NB: no memory allocation happens here; the list must fit the data (offset + n_vec). + * + * Usage example: + * @code{.cpp} + * auto list_data = index.lists()[label]->data.view(); + * // allocate the buffer for the input codes + * auto codes = raft::make_device_matrix(res, n_vec, index.dim()); + * ... prepare n_vecs to pack into the list in codes ... 
+ * // write codes into the list starting from the 42nd position + * ivf_flat::helpers::codepacker::pack( + * res, make_const_mdspan(codes.view()), index.veclen(), 42, list_data); + * @endcode + * + * @param[in] res + * @param[in] codes flat codes [n_vec, dim] + * @param[in] veclen size of interleaved data chunks + * @param[in] offset how many records to skip before writing the data into the list + * @param[inout] list_data block to write into + */ +void pack(raft::resources const& res, + raft::device_matrix_view codes, + uint32_t veclen, + uint32_t offset, + raft::device_mdspan::list_extents, + raft::row_major> list_data); + +/** + * Write flat codes into an existing list by the given offset. + * + * NB: no memory allocation happens here; the list must fit the data (offset + n_vec). + * + * Usage example: + * @code{.cpp} + * auto list_data = index.lists()[label]->data.view(); + * // allocate the buffer for the input codes + * auto codes = raft::make_device_matrix(res, n_vec, index.dim()); + * ... prepare n_vecs to pack into the list in codes ... + * // write codes into the list starting from the 42nd position + * ivf_flat::helpers::codepacker::pack( + * res, make_const_mdspan(codes.view()), index.veclen(), 42, list_data); + * @endcode + * + * @param[in] res + * @param[in] codes flat codes [n_vec, dim] + * @param[in] veclen size of interleaved data chunks + * @param[in] offset how many records to skip before writing the data into the list + * @param[inout] list_data block to write into + */ +void pack(raft::resources const& res, + raft::device_matrix_view codes, + uint32_t veclen, + uint32_t offset, + raft::device_mdspan::list_extents, + raft::row_major> list_data); + +/** + * Write flat codes into an existing list by the given offset. + * + * NB: no memory allocation happens here; the list must fit the data (offset + n_vec). + * + * Usage example: + * @code{.cpp} + * auto list_data = index.lists()[label]->data.view(); + * // allocate the buffer for the input codes + * auto codes = raft::make_device_matrix(res, n_vec, index.dim()); + * ... prepare n_vecs to pack into the list in codes ... + * // write codes into the list starting from the 42nd position + * ivf_flat::helpers::codepacker::pack( + * res, make_const_mdspan(codes.view()), index.veclen(), 42, list_data); + * @endcode + * + * @param[in] res + * @param[in] codes flat codes [n_vec, dim] + * @param[in] veclen size of interleaved data chunks + * @param[in] offset how many records to skip before writing the data into the list + * @param[inout] list_data block to write into + */ +void pack(raft::resources const& res, + raft::device_matrix_view codes, + uint32_t veclen, + uint32_t offset, + raft::device_mdspan::list_extents, + raft::row_major> list_data); + +/** + * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index + * starting at given `offset`. + * + * Usage example: + * @code{.cpp} + * auto list_data = index.lists()[label]->data.view(); + * // allocate the buffer for the output + * uint32_t n_take = 4; + * auto codes = raft::make_device_matrix(res, n_take, index.dim()); + * uint32_t offset = 0; + * // unpack n_take elements from the list + * ivf_fat::helpers::codepacker::unpack(res, list_data, index.veclen(), offset, codes.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] list_data block to read from + * @param[in] veclen size of interleaved data chunks + * @param[in] offset + * How many records in the list to skip. 
+ * @param[inout] codes + * the destination buffer [n_take, index.dim()]. + * The length `n_take` defines how many records to unpack, + * it must be <= the list size. + */ +void unpack(raft::resources const& res, + raft::device_mdspan::list_extents, + raft::row_major> list_data, + uint32_t veclen, + uint32_t offset, + raft::device_matrix_view codes); + +/** + * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index + * starting at given `offset`. + * + * Usage example: + * @code{.cpp} + * auto list_data = index.lists()[label]->data.view(); + * // allocate the buffer for the output + * uint32_t n_take = 4; + * auto codes = raft::make_device_matrix(res, n_take, index.dim()); + * uint32_t offset = 0; + * // unpack n_take elements from the list + * ivf_fat::helpers::codepacker::unpack(res, list_data, index.veclen(), offset, codes.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] list_data block to read from + * @param[in] veclen size of interleaved data chunks + * @param[in] offset + * How many records in the list to skip. + * @param[inout] codes + * the destination buffer [n_take, index.dim()]. + * The length `n_take` defines how many records to unpack, + * it must be <= the list size. + */ +void unpack(raft::resources const& res, + raft::device_mdspan::list_extents, + raft::row_major> list_data, + uint32_t veclen, + uint32_t offset, + raft::device_matrix_view codes); + +/** + * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index + * starting at given `offset`. + * + * Usage example: + * @code{.cpp} + * auto list_data = index.lists()[label]->data.view(); + * // allocate the buffer for the output + * uint32_t n_take = 4; + * auto codes = raft::make_device_matrix(res, n_take, index.dim()); + * uint32_t offset = 0; + * // unpack n_take elements from the list + * ivf_fat::helpers::codepacker::unpack(res, list_data, index.veclen(), offset, codes.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] list_data block to read from + * @param[in] veclen size of interleaved data chunks + * @param[in] offset + * How many records in the list to skip. + * @param[inout] codes + * the destination buffer [n_take, index.dim()]. + * The length `n_take` defines how many records to unpack, + * it must be <= the list size. + */ +void unpack(raft::resources const& res, + raft::device_mdspan::list_extents, + raft::row_major> list_data, + uint32_t veclen, + uint32_t offset, + raft::device_matrix_view codes); + +/** + * Write one flat code into a block by the given offset. The offset indicates the id of the record + * in the list. This function interleaves the code and is intended to later copy the interleaved + * codes over to the IVF list on device. NB: no memory allocation happens here; the block must fit + * the record (offset + 1). + * + * @param[in] flat_code input flat code + * @param[out] block block of memory to write interleaved codes to + * @param[in] dim dimension of the flat code + * @param[in] veclen size of interleaved data chunks + * @param[in] offset how many records to skip before writing the data into the list + */ +void pack_1(const float* flat_code, float* block, uint32_t dim, uint32_t veclen, uint32_t offset); + +/** + * Write one flat code into a block by the given offset. The offset indicates the id of the record + * in the list. This function interleaves the code and is intended to later copy the interleaved + * codes over to the IVF list on device. 
NB: no memory allocation happens here; the block must fit + * the record (offset + 1). + * + * @param[in] flat_code input flat code + * @param[out] block block of memory to write interleaved codes to + * @param[in] dim dimension of the flat code + * @param[in] veclen size of interleaved data chunks + * @param[in] offset how many records to skip before writing the data into the list + */ +void pack_1(const int8_t* flat_code, int8_t* block, uint32_t dim, uint32_t veclen, uint32_t offset); + +/** + * Write one flat code into a block by the given offset. The offset indicates the id of the record + * in the list. This function interleaves the code and is intended to later copy the interleaved + * codes over to the IVF list on device. NB: no memory allocation happens here; the block must fit + * the record (offset + 1). + * + * @param[in] flat_code input flat code + * @param[out] block block of memory to write interleaved codes to + * @param[in] dim dimension of the flat code + * @param[in] veclen size of interleaved data chunks + * @param[in] offset how many records to skip before writing the data into the list + */ +void pack_1( + const uint8_t* flat_code, uint8_t* block, uint32_t dim, uint32_t veclen, uint32_t offset); + +/** + * Unpack 1 record of a single list (cluster) in the index to fetch the flat code. The offset + * indicates the id of the record. This function fetches one flat code from an interleaved code. + * + * @param[in] block interleaved block. The block can be thought of as the whole inverted list in + * interleaved format. + * @param[out] flat_code output flat code + * @param[in] dim dimension of the flat code + * @param[in] veclen size of interleaved data chunks + * @param[in] offset fetch the flat code by the given offset + */ +void unpack_1(const float* block, float* flat_code, uint32_t dim, uint32_t veclen, uint32_t offset); + +/** + * Unpack 1 record of a single list (cluster) in the index to fetch the flat code. The offset + * indicates the id of the record. This function fetches one flat code from an interleaved code. + * + * @param[in] block interleaved block. The block can be thought of as the whole inverted list in + * interleaved format. + * @param[out] flat_code output flat code + * @param[in] dim dimension of the flat code + * @param[in] veclen size of interleaved data chunks + * @param[in] offset fetch the flat code by the given offset + */ +void unpack_1( + const int8_t* block, int8_t* flat_code, uint32_t dim, uint32_t veclen, uint32_t offset); + +/** + * Unpack 1 record of a single list (cluster) in the index to fetch the flat code. The offset + * indicates the id of the record. This function fetches one flat code from an interleaved code. + * + * @param[in] block interleaved block. The block can be thought of as the whole inverted list in + * interleaved format. + * @param[out] flat_code output flat code + * @param[in] dim dimension of the flat code + * @param[in] veclen size of interleaved data chunks + * @param[in] offset fetch the flat code by the given offset + */ +void unpack_1( + const uint8_t* block, uint8_t* flat_code, uint32_t dim, uint32_t veclen, uint32_t offset); + +} // namespace codepacker + +/** + * @brief Public helper API to reset the data and indices ptrs, and the list sizes. Useful for + * externally modifying the index without going through the build stage. The data and indices of the + * IVF lists will be lost. 
+ * + * Usage example: + * @code{.cpp} + * raft::resources res; + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_flat::index_params index_params; + * // initialize an empty index + * ivf_flat::index index(res, index_params, D); + * // reset the index's state and list sizes + * ivf_flat::helpers::reset_index(res, &index); + * @endcode + * + * @param[in] res raft resource + * @param[inout] index pointer to IVF-Flat index + */ +void reset_index(const raft::resources& res, index* index); + +/** + * @brief Public helper API to reset the data and indices ptrs, and the list sizes. Useful for + * externally modifying the index without going through the build stage. The data and indices of the + * IVF lists will be lost. + * + * Usage example: + * @code{.cpp} + * raft::resources res; + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_flat::index_params index_params; + * // initialize an empty index + * ivf_flat::index index(res, index_params, D); + * // reset the index's state and list sizes + * ivf_flat::helpers::reset_index(res, &index); + * @endcode + * + * @param[in] res raft resource + * @param[inout] index pointer to IVF-Flat index + */ +void reset_index(const raft::resources& res, index* index); + +/** + * @brief Public helper API to reset the data and indices ptrs, and the list sizes. Useful for + * externally modifying the index without going through the build stage. The data and indices of the + * IVF lists will be lost. + * + * Usage example: + * @code{.cpp} + * raft::resources res; + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_flat::index_params index_params; + * // initialize an empty index + * ivf_flat::index index(res, index_params, D); + * // reset the index's state and list sizes + * ivf_flat::helpers::reset_index(res, &index); + * @endcode + * + * @param[in] res raft resource + * @param[inout] index pointer to IVF-Flat index + */ +void reset_index(const raft::resources& res, index* index); + +/** + * @} + */ + +} // namespace helpers } // namespace cuvs::neighbors::ivf_flat \ No newline at end of file diff --git a/cpp/src/core/nvtx.hpp b/cpp/src/core/nvtx.hpp index 79382bbd0..2580b4964 100644 --- a/cpp/src/core/nvtx.hpp +++ b/cpp/src/core/nvtx.hpp @@ -16,9 +16,90 @@ #pragma once -namespace cuvs::common::nvtx::domain { +#include + +namespace cuvs::common::nvtx { +namespace domain { +/** @brief The default NVTX domain. */ +struct app { + static constexpr char const* name{"application"}; +}; + /** @brief This NVTX domain is supposed to be used within cuvs. */ struct cuvs { static constexpr const char* name = "cuvs"; }; -}; // namespace cuvs::common::nvtx::domain +} // namespace domain + +/** + * @brief Push a named NVTX range. + * + * @tparam Domain optional struct that defines the NVTX domain message; + * You can create a new domain with a custom message as follows: + * \code{.cpp} + * struct custom_domain { static constexpr char const* name{"custom message"}; } + * \endcode + * NB: make sure to use the same domain for `push_range` and `pop_range`. + * @param format range name format (accepts printf-style arguments) + * @param args the arguments for the printf-style formatting + */ +template +inline void push_range(const char* format, Args... args) +{ + raft::common::nvtx::detail::push_range(format, args...); +} + +/** + * @brief Pop the latest range. 
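+ *
+ * A minimal sketch pairing `push_range`/`pop_range` in the `cuvs` domain (`k` is assumed to be
+ * an in-scope integer; the scoped `range` class below is usually the safer choice):
+ * \code{.cpp}
+ * cuvs::common::nvtx::push_range<cuvs::common::nvtx::domain::cuvs>("ivf_flat::search(k = %u)", k);
+ * // ... the annotated work ...
+ * cuvs::common::nvtx::pop_range<cuvs::common::nvtx::domain::cuvs>();
+ * \endcode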
+ * + * @tparam Domain optional struct that defines the NVTX domain message; + * You can create a new domain with a custom message as follows: + * \code{.cpp} + * struct custom_domain { static constexpr char const* name{"custom message"}; } + * \endcode + * NB: make sure to use the same domain for `push_range` and `pop_range`. + */ +template +inline void pop_range() +{ + raft::common::nvtx::detail::pop_range(); +} + +/** + * @brief Push a named NVTX range that would be popped at the end of the object lifetime. + * + * Refer to \ref Usage for the usage examples. + * + * @tparam Domain optional struct that defines the NVTX domain message; + * You can create a new domain with a custom message as follows: + * \code{.cpp} + * struct custom_domain { static constexpr char const* name{"custom message"}; } + * \endcode + */ +template +class range { + public: + /** + * Push a named NVTX range. + * At the end of the object lifetime, pop the range back. + * + * @param format range name format (accepts printf-style arguments) + * @param args the arguments for the printf-style formatting + */ + template + explicit range(const char* format, Args... args) + { + push_range(format, args...); + } + + ~range() { pop_range(); } + + /* This object is not meant to be touched. */ + range(const range&) = delete; + range(range&&) = delete; + auto operator=(const range&) -> range& = delete; + auto operator=(range&&) -> range& = delete; + static auto operator new(std::size_t) -> void* = delete; + static auto operator new[](std::size_t) -> void* = delete; +}; +}; // namespace cuvs::common::nvtx diff --git a/cpp/src/neighbors/detail/div_utils.cuh b/cpp/src/neighbors/detail/div_utils.cuh new file mode 100644 index 000000000..b54b8400e --- /dev/null +++ b/cpp/src/neighbors/detail/div_utils.cuh @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +/** + * @brief A simple wrapper for raft::Pow2 which uses Pow2 utils only when available and regular + * integer division otherwise. This is done to allow a common interface for division arithmetic for + * non CUDA headers. + * + * @tparam Value_ a compile-time value representable as a power-of-two. 
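+ *
+ * A minimal sketch, assuming the interleaved group size of 32 used by the IVF-Flat lists:
+ * @code{.cpp}
+ *   using interleaved_group = cuvs::neighbors::detail::div_utils<32>;
+ *   uint32_t group_offset = interleaved_group::roundDown(37u);  // -> 32
+ *   uint32_t ingroup_id   = interleaved_group::mod(37u);        // -> 5
+ * @endcode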
+ */ +namespace cuvs::neighbors::detail { +template +struct div_utils { + typedef decltype(Value_) Type; + static constexpr Type Value = Value_; + + template + static constexpr _RAFT_HOST_DEVICE inline auto roundDown(T x) + { + return raft::Pow2::roundDown(x); + } + + template + static constexpr _RAFT_HOST_DEVICE inline auto mod(T x) + { + return raft::Pow2::mod(x); + } + + template + static constexpr _RAFT_HOST_DEVICE inline auto div(T x) + { + return raft::Pow2::div(x); + } +}; +} // namespace cuvs::neighbors::detail diff --git a/cpp/src/neighbors/ivf_flat/generate_ivf_flat.py b/cpp/src/neighbors/ivf_flat/generate_ivf_flat.py index bf0cad6d4..1733ca8b2 100644 --- a/cpp/src/neighbors/ivf_flat/generate_ivf_flat.py +++ b/cpp/src/neighbors/ivf_flat/generate_ivf_flat.py @@ -37,9 +37,21 @@ * */ -#include #include +""" + +build_include_macro = """ +#include "ivf_flat_build.cuh" +""" +search_include_macro = """ +#include "ivf_flat_search.cuh" +""" + +serialize_include_macro = """ +#include "ivf_flat_serialize.cuh" +""" +namespace_macro = """ namespace cuvs::neighbors::ivf_flat { """ @@ -54,14 +66,14 @@ ) build_macro = """ -#define CUVS_INST_IVF_FLAT_BUILD(T, IdxT) \\ +#define CUVS_INST_IVF_FLAT_BUILD_EXTEND(T, IdxT) \\ auto build(raft::resources const& handle, \\ const cuvs::neighbors::ivf_flat::index_params& params, \\ raft::device_matrix_view dataset) \\ ->cuvs::neighbors::ivf_flat::index \\ { \\ return cuvs::neighbors::ivf_flat::index( \\ - std::move(raft::runtime::neighbors::ivf_flat::build(handle, params, dataset))); \\ + std::move(cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset))); \\ } \\ \\ void build(raft::resources const& handle, \\ @@ -69,12 +81,24 @@ raft::device_matrix_view dataset, \\ cuvs::neighbors::ivf_flat::index& idx) \\ { \\ - raft::runtime::neighbors::ivf_flat::build(handle, params, dataset, *idx.get_raft_index()); \\ - } -""" - -extend_macro = """ -#define CUVS_INST_IVF_FLAT_EXTEND(T, IdxT) \\ + cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset, idx); \\ + } \\ + auto build(raft::resources const& handle, \\ + const cuvs::neighbors::ivf_flat::index_params& params, \\ + raft::host_matrix_view dataset) \\ + ->cuvs::neighbors::ivf_flat::index \\ + { \\ + return cuvs::neighbors::ivf_flat::index( \\ + std::move(cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset))); \\ + } \\ + \\ + void build(raft::resources const& handle, \\ + const cuvs::neighbors::ivf_flat::index_params& params, \\ + raft::host_matrix_view dataset, \\ + cuvs::neighbors::ivf_flat::index& idx) \\ + { \\ + cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset, idx); \\ + } \\ auto extend(raft::resources const& handle, \\ raft::device_matrix_view new_vectors, \\ std::optional> new_indices, \\ @@ -82,8 +106,8 @@ ->cuvs::neighbors::ivf_flat::index \\ { \\ return cuvs::neighbors::ivf_flat::index( \\ - std::move(raft::runtime::neighbors::ivf_flat::extend( \\ - handle, new_vectors, new_indices, *orig_index.get_raft_index()))); \\ + std::move(cuvs::neighbors::ivf_flat::detail::extend( \\ + handle, new_vectors, new_indices, orig_index))); \\ } \\ \\ void extend(raft::resources const& handle, \\ @@ -91,9 +115,28 @@ std::optional> new_indices, \\ cuvs::neighbors::ivf_flat::index* idx) \\ { \\ - raft::runtime::neighbors::ivf_flat::extend( \\ - handle, new_vectors, new_indices, idx->get_raft_index()); \\ - } + cuvs::neighbors::ivf_flat::detail::extend( \\ + handle, new_vectors, new_indices, idx); \\ + } \\ + auto extend(raft::resources const& handle, \\ + 
raft::host_matrix_view new_vectors, \\ + std::optional> new_indices, \\ + const cuvs::neighbors::ivf_flat::index& orig_index) \\ + ->cuvs::neighbors::ivf_flat::index \\ + { \\ + return cuvs::neighbors::ivf_flat::index( \\ + std::move(cuvs::neighbors::ivf_flat::detail::extend( \\ + handle, new_vectors, new_indices, orig_index))); \\ + } \\ + \\ + void extend(raft::resources const& handle, \\ + raft::host_matrix_view new_vectors, \\ + std::optional> new_indices, \\ + cuvs::neighbors::ivf_flat::index* idx) \\ + { \\ + cuvs::neighbors::ivf_flat::detail::extend( \\ + handle, new_vectors, new_indices, idx); \\ + } """ search_macro = """ @@ -105,8 +148,20 @@ raft::device_matrix_view neighbors, \\ raft::device_matrix_view distances) \\ { \\ - raft::runtime::neighbors::ivf_flat::search( \\ - handle, params, *index.get_raft_index(), queries, neighbors, distances); \\ + cuvs::neighbors::ivf_flat::detail::search( \\ + handle, params, index, queries, neighbors, distances); \\ + } \\ + void search_with_filtering( \\ + raft::resources const& handle, \\ + const search_params& params, \\ + index& idx, \\ + raft::device_matrix_view queries, \\ + raft::device_matrix_view neighbors, \\ + raft::device_matrix_view distances, \\ + cuvs::neighbors::filtering::bitset_filter sample_filter) \\ + { \\ + cuvs::neighbors::ivf_flat::detail::search_with_filtering( \\ + handle, params, idx, queries, neighbors, distances, sample_filter); \\ } """ @@ -116,46 +171,48 @@ const std::string& filename, \\ const cuvs::neighbors::ivf_flat::index& index) \\ { \\ - raft::runtime::neighbors::ivf_flat::serialize_file(handle, filename, *index.get_raft_index()); \\ - } \\ - \\ - void deserialize_file(raft::resources const& handle, \\ - const std::string& filename, \\ - cuvs::neighbors::ivf_flat::index* index) \\ - { \\ - raft::runtime::neighbors::ivf_flat::deserialize_file( \\ - handle, filename, index->get_raft_index()); \\ + cuvs::neighbors::ivf_flat::detail::serialize(handle, filename, index); \\ } \\ \\ void serialize(raft::resources const& handle, \\ std::string& str, \\ const cuvs::neighbors::ivf_flat::index& index) \\ { \\ - raft::runtime::neighbors::ivf_flat::serialize(handle, str, *index.get_raft_index()); \\ + std::ostringstream os; \\ + cuvs::neighbors::ivf_flat::detail::serialize(handle, os, index); \\ + str = os.str(); \\ } \\ \\ - void deserialize(raft::resources const& handle, \\ - const std::string& str, \\ - cuvs::neighbors::ivf_flat::index* index) \\ + void deserialize_file(raft::resources const& handle, \\ + const std::string& filename, \\ + cuvs::neighbors::ivf_flat::index* index) \\ { \\ - raft::runtime::neighbors::ivf_flat::deserialize(handle, str, index->get_raft_index()); \\ - } + * index = cuvs::neighbors::ivf_flat::detail::deserialize( \\ + handle, filename); \\ + } \\ + void deserialize(raft::resources const& handle, \\ + const std::string& str, \\ + cuvs::neighbors::ivf_flat::index* index) \\ + { \\ + std::istringstream is(str); \\ + * index = cuvs::neighbors::ivf_flat::detail::deserialize( \\ + handle, is); \\ + } """ macros = dict( - build=dict( + build_extend=dict( + include=build_include_macro, definition=build_macro, - name="CUVS_INST_IVF_FLAT_BUILD", - ), - extend=dict( - definition=extend_macro, - name="CUVS_INST_IVF_FLAT_EXTEND", + name="CUVS_INST_IVF_FLAT_BUILD_EXTEND", ), search=dict( + include=search_include_macro, definition=search_macro, name="CUVS_INST_IVF_FLAT_SEARCH", ), serialize=dict( + include=serialize_include_macro, definition=serialize_macro, name="CUVS_INST_IVF_FLAT_SERIALIZE", ), @@ 
-163,9 +220,11 @@ for type_path, (T, IdxT) in types.items(): for macro_path, macro in macros.items(): - path = f"ivf_flat_{macro_path}_{type_path}.cpp" + path = f"ivf_flat_{macro_path}_{type_path}.cu" with open(path, "w") as f: f.write(header) + f.write(macro['include']) + f.write(namespace_macro) f.write(macro["definition"]) f.write(f"{macro['name']}({T}, {IdxT});\n\n") f.write(f"#undef {macro['name']}\n") diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh new file mode 100644 index 000000000..89cd5ec57 --- /dev/null +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh @@ -0,0 +1,601 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../../core/nvtx.hpp" +#include "../ivf_common.cuh" +#include "../ivf_list.cuh" + +#include +#include + +#include "../detail/ann_utils.cuh" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cuvs::neighbors::ivf_flat { +using namespace cuvs::spatial::knn::detail; // NOLINT + +namespace detail { + +template +auto clone(const raft::resources& res, const index& source) -> index +{ + auto stream = raft::resource::get_cuda_stream(res); + + // Allocate the new index + index target(res, + source.metric(), + source.n_lists(), + source.adaptive_centers(), + source.conservative_memory_allocation(), + source.dim()); + + // Copy the independent parts + raft::copy(target.list_sizes().data_handle(), + source.list_sizes().data_handle(), + source.list_sizes().size(), + stream); + raft::copy(target.centers().data_handle(), + source.centers().data_handle(), + source.centers().size(), + stream); + if (source.center_norms().has_value()) { + target.allocate_center_norms(res); + raft::copy(target.center_norms()->data_handle(), + source.center_norms()->data_handle(), + source.center_norms()->size(), + stream); + } + // Copy shared pointers + target.lists() = source.lists(); + + // Make sure the device pointers point to the new lists + ivf::detail::recompute_internal_state(res, target); + + return target; +} + +/** + * @brief Record the dataset into the index, one source row at a time. + * + * The index consists of the dataset rows, grouped by their labels (into clusters/lists). + * Within each cluster (list), the data is grouped into blocks of `WarpSize` interleaved + * vectors. Note, the total index length is slightly larger than the dataset length, because + * each cluster is padded by `WarpSize` elements + * + * CUDA launch grid: + * X dimension must cover the dataset (n_rows), YZ are not used; + * there are no dependencies between threads, hence no constraints on the block size. + * + * @tparam T element type. 
+ * @tparam IdxT type of the indices in the source source_vecs + * @tparam LabelT label type + * @tparam gather_src if false, then we build the index from vectors source_vecs[i,:], otherwise + * we use source_vecs[source_ixs[i],:]. In both cases i=0..n_rows-1. + * + * @param[in] labels device pointer to the cluster ids for each row [n_rows] + * @param[in] source_vecs device pointer to the input data [n_rows, dim] + * @param[in] source_ixs device pointer to the input indices [n_rows] + * @param[out] list_data_ptrs device pointer to the index data of size [n_lists][index_size, dim] + * @param[out] list_index_ptrs device pointer to the source ids corr. to the output [n_lists] + * [index_size] + * @param[out] list_sizes_ptr device pointer to the cluster sizes [n_lists]; + * it's used as an atomic counter, and must be initialized with zeros. + * @param n_rows source length + * @param dim the dimensionality of the data + * @param veclen size of vectorized loads/stores; must satisfy `dim % veclen == 0`. + * + */ +template +RAFT_KERNEL build_index_kernel(const LabelT* labels, + const T* source_vecs, + const IdxT* source_ixs, + T** list_data_ptrs, + IdxT** list_index_ptrs, + uint32_t* list_sizes_ptr, + IdxT n_rows, + uint32_t dim, + uint32_t veclen, + IdxT batch_offset = 0) +{ + const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x; + if (i >= n_rows) { return; } + + auto list_id = labels[i]; + auto inlist_id = atomicAdd(list_sizes_ptr + list_id, 1); + auto* list_index = list_index_ptrs[list_id]; + auto* list_data = list_data_ptrs[list_id]; + + // Record the source vector id in the index + list_index[inlist_id] = source_ixs == nullptr ? i + batch_offset : source_ixs[i]; + + // The data is written in interleaved groups of `index::kGroupSize` vectors + using interleaved_group = raft::Pow2; + auto group_offset = interleaved_group::roundDown(inlist_id); + auto ingroup_id = interleaved_group::mod(inlist_id) * veclen; + + // Point to the location of the interleaved group of vectors + list_data += group_offset * dim; + + // Point to the source vector + if constexpr (gather_src) { + source_vecs += source_ixs[i] * dim; + } else { + source_vecs += i * dim; + } + // Interleave dimensions of the source vector while recording it. 
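+  // Layout within one group of `kIndexGroupSize` vectors: the chunk [l, l + veclen) of every
+  // vector in the group is stored contiguously, so this vector's chunk begins at
+  // `l * kIndexGroupSize + ingroup_id`.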
+ // NB: such `veclen` is selected, that `dim % veclen == 0` + for (uint32_t l = 0; l < dim; l += veclen) { + for (uint32_t j = 0; j < veclen; j++) { + list_data[l * kIndexGroupSize + ingroup_id + j] = source_vecs[l + j]; + } + } +} + +/** See raft::neighbors::ivf_flat::extend docs */ +template +void extend(raft::resources const& handle, + index* index, + const T* new_vectors, + const IdxT* new_indices, + IdxT n_rows) +{ + using LabelT = uint32_t; + RAFT_EXPECTS(index != nullptr, "index cannot be empty."); + + auto stream = raft::resource::get_cuda_stream(handle); + auto n_lists = index->n_lists(); + auto dim = index->dim(); + list_spec list_device_spec{index->dim(), + index->conservative_memory_allocation()}; + cuvs::common::nvtx::range fun_scope( + "ivf_flat::extend(%zu, %u)", size_t(n_rows), dim); + + RAFT_EXPECTS(new_indices != nullptr || index->size() == 0, + "You must pass data indices when the index is non-empty."); + + auto new_labels = raft::make_device_vector(handle, n_rows); + raft::cluster::kmeans_balanced_params kmeans_params; + kmeans_params.metric = static_cast(index->metric()); + auto orig_centroids_view = + raft::make_device_matrix_view(index->centers().data_handle(), n_lists, dim); + // Calculate the batch size for the input data if it's not accessible directly from the device + constexpr size_t kReasonableMaxBatchSize = 65536; + size_t max_batch_size = std::min(n_rows, kReasonableMaxBatchSize); + + // Predict the cluster labels for the new data, in batches if necessary + utils::batch_load_iterator vec_batches(new_vectors, + n_rows, + index->dim(), + max_batch_size, + stream, + raft::resource::get_workspace_resource(handle)); + + for (const auto& batch : vec_batches) { + auto batch_data_view = + raft::make_device_matrix_view(batch.data(), batch.size(), index->dim()); + auto batch_labels_view = raft::make_device_vector_view( + new_labels.data_handle() + batch.offset(), batch.size()); + raft::cluster::kmeans_balanced::predict(handle, + kmeans_params, + batch_data_view, + orig_centroids_view, + batch_labels_view, + utils::mapping{}); + } + + auto* list_sizes_ptr = index->list_sizes().data_handle(); + auto old_list_sizes_dev = raft::make_device_vector(handle, n_lists); + raft::copy(old_list_sizes_dev.data_handle(), list_sizes_ptr, n_lists, stream); + + // Calculate the centers and sizes on the new data, starting from the original values + if (index->adaptive_centers()) { + auto centroids_view = raft::make_device_matrix_view( + index->centers().data_handle(), index->centers().extent(0), index->centers().extent(1)); + auto list_sizes_view = + raft::make_device_vector_view, IdxT>( + list_sizes_ptr, n_lists); + for (const auto& batch : vec_batches) { + auto batch_data_view = + raft::make_device_matrix_view(batch.data(), batch.size(), index->dim()); + auto batch_labels_view = raft::make_device_vector_view( + new_labels.data_handle() + batch.offset(), batch.size()); + raft::cluster::kmeans_balanced::helpers::calc_centers_and_sizes(handle, + batch_data_view, + batch_labels_view, + centroids_view, + list_sizes_view, + false, + utils::mapping{}); + } + } else { + raft::stats::histogram(raft::stats::HistTypeAuto, + reinterpret_cast(list_sizes_ptr), + IdxT(n_lists), + new_labels.data_handle(), + n_rows, + 1, + stream); + raft::linalg::add( + list_sizes_ptr, list_sizes_ptr, old_list_sizes_dev.data_handle(), n_lists, stream); + } + + // Calculate and allocate new list data + std::vector new_list_sizes(n_lists); + std::vector old_list_sizes(n_lists); + { + raft::copy(old_list_sizes.data(), 
old_list_sizes_dev.data_handle(), n_lists, stream); + raft::copy(new_list_sizes.data(), list_sizes_ptr, n_lists, stream); + raft::resource::sync_stream(handle); + auto& lists = index->lists(); + for (uint32_t label = 0; label < n_lists; label++) { + ivf::resize_list(handle, + lists[label], + list_device_spec, + new_list_sizes[label], + raft::Pow2::roundUp(old_list_sizes[label])); + } + } + // Update the pointers and the sizes + ivf::detail::recompute_internal_state(handle, *index); + // Copy the old sizes, so we can start from the current state of the index; + // we'll rebuild the `list_sizes_ptr` in the following kernel, using it as an atomic counter. + raft::copy(list_sizes_ptr, old_list_sizes_dev.data_handle(), n_lists, stream); + + utils::batch_load_iterator vec_indices( + new_indices, n_rows, 1, max_batch_size, stream, raft::resource::get_workspace_resource(handle)); + utils::batch_load_iterator idx_batch = vec_indices.begin(); + size_t next_report_offset = 0; + size_t d_report_offset = n_rows * 5 / 100; + for (const auto& batch : vec_batches) { + auto batch_data_view = + raft::make_device_matrix_view(batch.data(), batch.size(), index->dim()); + // Kernel to insert the new vectors + const dim3 block_dim(256); + const dim3 grid_dim(raft::ceildiv(batch.size(), block_dim.x)); + build_index_kernel + <<>>(new_labels.data_handle() + batch.offset(), + batch_data_view.data_handle(), + idx_batch->data(), + index->data_ptrs().data_handle(), + index->inds_ptrs().data_handle(), + list_sizes_ptr, + batch.size(), + dim, + index->veclen(), + batch.offset()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + if (batch.offset() > next_report_offset) { + float progress = batch.offset() * 100.0f / n_rows; + RAFT_LOG_DEBUG("ivf_flat::extend added vectors %zu, %6.1f%% complete", + static_cast(batch.offset()), + progress); + next_report_offset += d_report_offset; + } + ++idx_batch; + } + // Precompute the centers vector norms for L2Expanded distance + if (!index->center_norms().has_value()) { + index->allocate_center_norms(handle); + if (index->center_norms().has_value()) { + raft::linalg::rowNorm(index->center_norms()->data_handle(), + index->centers().data_handle(), + dim, + n_lists, + raft::linalg::L2Norm, + true, + stream); + RAFT_LOG_TRACE_VEC(index->center_norms()->data_handle(), std::min(dim, 20)); + } + } else if (index->center_norms().has_value() && index->adaptive_centers()) { + raft::linalg::rowNorm(index->center_norms()->data_handle(), + index->centers().data_handle(), + dim, + n_lists, + raft::linalg::L2Norm, + true, + stream); + RAFT_LOG_TRACE_VEC(index->center_norms()->data_handle(), std::min(dim, 20)); + } +} + +/** See raft::neighbors::ivf_flat::extend docs */ +template +auto extend(raft::resources const& handle, + const index& orig_index, + const T* new_vectors, + const IdxT* new_indices, + IdxT n_rows) -> index +{ + auto ext_index = clone(handle, orig_index); + detail::extend(handle, &ext_index, new_vectors, new_indices, n_rows); + return ext_index; +} + +/** See raft::neighbors::ivf_flat::build docs */ +template +inline auto build(raft::resources const& handle, + const index_params& params, + const T* dataset, + IdxT n_rows, + uint32_t dim) -> index +{ + auto stream = raft::resource::get_cuda_stream(handle); + cuvs::common::nvtx::range fun_scope( + "ivf_flat::build(%zu, %u)", size_t(n_rows), dim); + static_assert(std::is_same_v || std::is_same_v || std::is_same_v, + "unsupported data type"); + RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset"); + RAFT_EXPECTS(n_rows >= params.n_lists, 
"number of rows can't be less than n_lists"); + + index index(handle, params, dim); + utils::memzero( + index.accum_sorted_sizes().data_handle(), index.accum_sorted_sizes().size(), stream); + utils::memzero(index.list_sizes().data_handle(), index.list_sizes().size(), stream); + utils::memzero(index.data_ptrs().data_handle(), index.data_ptrs().size(), stream); + utils::memzero(index.inds_ptrs().data_handle(), index.inds_ptrs().size(), stream); + + // Train the kmeans clustering + { + auto trainset_ratio = std::max( + 1, n_rows / std::max(params.kmeans_trainset_fraction * n_rows, index.n_lists())); + auto n_rows_train = n_rows / trainset_ratio; + rmm::device_uvector trainset(n_rows_train * index.dim(), stream); + // TODO: a proper sampling + RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(), + sizeof(T) * index.dim(), + dataset, + sizeof(T) * index.dim() * trainset_ratio, + sizeof(T) * index.dim(), + n_rows_train, + cudaMemcpyDefault, + stream)); + auto trainset_const_view = + raft::make_device_matrix_view(trainset.data(), n_rows_train, index.dim()); + auto centers_view = raft::make_device_matrix_view( + index.centers().data_handle(), index.n_lists(), index.dim()); + raft::cluster::kmeans_balanced_params kmeans_params; + kmeans_params.n_iters = params.kmeans_n_iters; + kmeans_params.metric = static_cast(index.metric()); + raft::cluster::kmeans_balanced::fit( + handle, kmeans_params, trainset_const_view, centers_view, utils::mapping{}); + } + + // add the data if necessary + if (params.add_data_on_build) { + detail::extend(handle, &index, dataset, nullptr, n_rows); + } + return index; +} + +/** + * Build an index that can be used in refinement operation. + * + * See raft::neighbors::refine for details on the refinement operation. + * + * The returned index cannot be used for a regular ivf_flat::search. The index misses information + * about coarse clusters. Instead, the neighbor candidates are assumed to form clusters, one for + * each query. The candidate vectors are gathered into the index dataset, that can be later used + * in ivfflat_interleaved_scan. + * + * @param[in] handle the raft handle + * @param[inout] refinement_index + * @param[in] dataset device pointer to dataset vectors, size [n_rows, dim]. Note that n_rows is + * not known to this function, but each candidate_idx has to be smaller than n_rows. + * @param[in] candidate_idx device pointer to neighbor candidates, size [n_queries, n_candidates] + * @param[in] n_candidates of neighbor_candidates + */ +template +inline void fill_refinement_index(raft::resources const& handle, + index* refinement_index, + const T* dataset, + const IdxT* candidate_idx, + IdxT n_queries, + uint32_t n_candidates) +{ + using LabelT = uint32_t; + + auto stream = raft::resource::get_cuda_stream(handle); + uint32_t n_lists = n_queries; + common::nvtx::range fun_scope( + "ivf_flat::fill_refinement_index(%zu, %u)", size_t(n_queries)); + + rmm::device_uvector new_labels(n_queries * n_candidates, stream); + auto new_labels_view = + raft::make_device_vector_view(new_labels.data(), n_queries * n_candidates); + raft::linalg::map_offset( + handle, + new_labels_view, + raft::compose_op(raft::cast_op(), raft::div_const_op(n_candidates))); + + auto list_sizes_ptr = refinement_index->list_sizes().data_handle(); + // We do not fill centers and center norms, since we will not run coarse search. 
+ + // Allocate new memory + auto& lists = refinement_index->lists(); + list_spec list_device_spec{refinement_index->dim(), false}; + for (uint32_t label = 0; label < n_lists; label++) { + ivf::resize_list(handle, lists[label], list_device_spec, n_candidates, uint32_t(0)); + } + // Update the pointers and the sizes + ivf::detail::recompute_internal_state(handle, *refinement_index); + + RAFT_CUDA_TRY(cudaMemsetAsync(list_sizes_ptr, 0, n_lists * sizeof(uint32_t), stream)); + + const dim3 block_dim(256); + const dim3 grid_dim(raft::ceildiv(n_queries * n_candidates, block_dim.x)); + build_index_kernel + <<>>(new_labels.data(), + dataset, + candidate_idx, + refinement_index->data_ptrs().data_handle(), + refinement_index->inds_ptrs().data_handle(), + list_sizes_ptr, + n_queries * n_candidates, + refinement_index->dim(), + refinement_index->veclen()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +auto build(raft::resources const& handle, + const index_params& params, + raft::device_matrix_view dataset) -> index +{ + IdxT n_rows = dataset.extent(0); + IdxT dim = dataset.extent(1); + return build(handle, params, dataset.data_handle(), n_rows, dim); +} + +template +auto build(raft::resources const& handle, + const index_params& params, + raft::host_matrix_view dataset) -> index +{ + IdxT n_rows = dataset.extent(0); + IdxT dim = dataset.extent(1); + return build(handle, params, dataset.data_handle(), n_rows, dim); +} + +template +void build(raft::resources const& handle, + const index_params& params, + raft::device_matrix_view dataset, + index& index) +{ + IdxT n_rows = dataset.extent(0); + IdxT dim = dataset.extent(1); + index = build(handle, params, dataset.data_handle(), n_rows, dim); +} + +template +void build(raft::resources const& handle, + const index_params& params, + raft::host_matrix_view dataset, + index& index) +{ + IdxT n_rows = dataset.extent(0); + IdxT dim = dataset.extent(1); + index = build(handle, params, dataset.data_handle(), n_rows, dim); +} + +template +auto extend(raft::resources const& handle, + raft::device_matrix_view new_vectors, + std::optional> new_indices, + const cuvs::neighbors::ivf_flat::index& orig_index) -> index +{ + ASSERT(new_vectors.extent(1) == orig_index.dim(), + "new_vectors should have the same dimension as the index"); + + IdxT n_rows = new_vectors.extent(0); + if (new_indices.has_value()) { + ASSERT(n_rows == new_indices.value().extent(0), + "new_vectors and new_indices have different number of rows"); + } + + return extend(handle, + orig_index, + new_vectors.data_handle(), + new_indices.has_value() ? new_indices.value().data_handle() : nullptr, + n_rows); +} + +template +auto extend(raft::resources const& handle, + raft::host_matrix_view new_vectors, + std::optional> new_indices, + const cuvs::neighbors::ivf_flat::index& orig_index) -> index +{ + ASSERT(new_vectors.extent(1) == orig_index.dim(), + "new_vectors should have the same dimension as the index"); + + IdxT n_rows = new_vectors.extent(0); + if (new_indices.has_value()) { + ASSERT(n_rows == new_indices.value().extent(0), + "new_vectors and new_indices have different number of rows"); + } + + return extend(handle, + orig_index, + new_vectors.data_handle(), + new_indices.has_value() ? 
new_indices.value().data_handle() : nullptr, + n_rows); +} + +template +void extend(raft::resources const& handle, + raft::device_matrix_view new_vectors, + std::optional> new_indices, + index* index) +{ + ASSERT(new_vectors.extent(1) == index->dim(), + "new_vectors should have the same dimension as the index"); + + IdxT n_rows = new_vectors.extent(0); + if (new_indices.has_value()) { + ASSERT(n_rows == new_indices.value().extent(0), + "new_vectors and new_indices have different number of rows"); + } + + *index = extend(handle, + *index, + new_vectors.data_handle(), + new_indices.has_value() ? new_indices.value().data_handle() : nullptr, + n_rows); +} + +template +void extend(raft::resources const& handle, + raft::host_matrix_view new_vectors, + std::optional> new_indices, + index* index) +{ + ASSERT(new_vectors.extent(1) == index->dim(), + "new_vectors should have the same dimension as the index"); + + IdxT n_rows = new_vectors.extent(0); + if (new_indices.has_value()) { + ASSERT(n_rows == new_indices.value().extent(0), + "new_vectors and new_indices have different number of rows"); + } + + *index = extend(handle, + *index, + new_vectors.data_handle(), + new_indices.has_value() ? new_indices.value().data_handle() : nullptr, + n_rows); +} + +} // namespace detail +} // namespace cuvs::neighbors::ivf_flat diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build_extend_float_int64_t.cu b/cpp/src/neighbors/ivf_flat/ivf_flat_build_extend_float_int64_t.cu new file mode 100644 index 000000000..52026172c --- /dev/null +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_build_extend_float_int64_t.cu @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_ivf_flat.py + * + * Make changes there and run in this directory: + * + * > python generate_ivf_flat.py + * + */ + +#include + +#include "ivf_flat_build.cuh" + +namespace cuvs::neighbors::ivf_flat { + +#define CUVS_INST_IVF_FLAT_BUILD_EXTEND(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::device_matrix_view dataset) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index( \ + std::move(cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset))); \ + } \ + \ + void build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::device_matrix_view dataset, \ + cuvs::neighbors::ivf_flat::index& idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset, idx); \ + } \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::host_matrix_view dataset) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index( \ + std::move(cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset))); \ + } \ + \ + void build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::host_matrix_view dataset, \ + cuvs::neighbors::ivf_flat::index& idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset, idx); \ + } \ + auto extend(raft::resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + const cuvs::neighbors::ivf_flat::index& orig_index) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index(std::move( \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, orig_index))); \ + } \ + \ + void extend(raft::resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + cuvs::neighbors::ivf_flat::index* idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, idx); \ + } \ + auto extend(raft::resources const& handle, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices, \ + const cuvs::neighbors::ivf_flat::index& orig_index) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index(std::move( \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, orig_index))); \ + } \ + \ + void extend(raft::resources const& handle, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices, \ + cuvs::neighbors::ivf_flat::index* idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, idx); \ + } +CUVS_INST_IVF_FLAT_BUILD_EXTEND(float, int64_t); + +#undef CUVS_INST_IVF_FLAT_BUILD_EXTEND + +} // namespace cuvs::neighbors::ivf_flat diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build_extend_int8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat/ivf_flat_build_extend_int8_t_int64_t.cu new file mode 100644 index 000000000..ef326f241 --- /dev/null +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_build_extend_int8_t_int64_t.cu @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by generate_ivf_flat.py + * + * Make changes there and run in this directory: + * + * > python generate_ivf_flat.py + * + */ + +#include + +#include "ivf_flat_build.cuh" + +namespace cuvs::neighbors::ivf_flat { + +#define CUVS_INST_IVF_FLAT_BUILD_EXTEND(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::device_matrix_view dataset) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index( \ + std::move(cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset))); \ + } \ + \ + void build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::device_matrix_view dataset, \ + cuvs::neighbors::ivf_flat::index& idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset, idx); \ + } \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::host_matrix_view dataset) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index( \ + std::move(cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset))); \ + } \ + \ + void build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::host_matrix_view dataset, \ + cuvs::neighbors::ivf_flat::index& idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset, idx); \ + } \ + auto extend(raft::resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + const cuvs::neighbors::ivf_flat::index& orig_index) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index(std::move( \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, orig_index))); \ + } \ + \ + void extend(raft::resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + cuvs::neighbors::ivf_flat::index* idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, idx); \ + } \ + auto extend(raft::resources const& handle, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices, \ + const cuvs::neighbors::ivf_flat::index& orig_index) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index(std::move( \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, orig_index))); \ + } \ + \ + void extend(raft::resources const& handle, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices, \ + cuvs::neighbors::ivf_flat::index* idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, idx); \ + } +CUVS_INST_IVF_FLAT_BUILD_EXTEND(int8_t, int64_t); + +#undef CUVS_INST_IVF_FLAT_BUILD_EXTEND + +} // namespace cuvs::neighbors::ivf_flat diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build_extend_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat/ivf_flat_build_extend_uint8_t_int64_t.cu new file mode 100644 index 000000000..8a1f3e42f --- 
/dev/null +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_build_extend_uint8_t_int64_t.cu @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by generate_ivf_flat.py + * + * Make changes there and run in this directory: + * + * > python generate_ivf_flat.py + * + */ + +#include + +#include "ivf_flat_build.cuh" + +namespace cuvs::neighbors::ivf_flat { + +#define CUVS_INST_IVF_FLAT_BUILD_EXTEND(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::device_matrix_view dataset) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index( \ + std::move(cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset))); \ + } \ + \ + void build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::device_matrix_view dataset, \ + cuvs::neighbors::ivf_flat::index& idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset, idx); \ + } \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::host_matrix_view dataset) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index( \ + std::move(cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset))); \ + } \ + \ + void build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::host_matrix_view dataset, \ + cuvs::neighbors::ivf_flat::index& idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset, idx); \ + } \ + auto extend(raft::resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + const cuvs::neighbors::ivf_flat::index& orig_index) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index(std::move( \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, orig_index))); \ + } \ + \ + void extend(raft::resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + cuvs::neighbors::ivf_flat::index* idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, idx); \ + } \ + auto extend(raft::resources const& handle, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices, \ + const cuvs::neighbors::ivf_flat::index& orig_index) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index(std::move( \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, orig_index))); \ + } \ + \ + void extend(raft::resources const& handle, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices, \ + cuvs::neighbors::ivf_flat::index* idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, idx); \ + } +CUVS_INST_IVF_FLAT_BUILD_EXTEND(uint8_t, int64_t); 
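+// A typical call pattern for the API instantiated above (sketch only; `res` is a caller-owned
+// raft::resources handle, `dataset_view` and `more_vecs_view` are
+// raft::device_matrix_view<const uint8_t, int64_t>, and `more_ids_view` is a
+// raft::device_vector_view<const int64_t, int64_t>):
+//
+//   cuvs::neighbors::ivf_flat::index_params params;
+//   params.n_lists = 1024;
+//   auto idx = cuvs::neighbors::ivf_flat::build(res, params, dataset_view);
+//   // passing indices is required here because `idx` is non-empty after build
+//   cuvs::neighbors::ivf_flat::extend(res, more_vecs_view, std::make_optional(more_ids_view), &idx);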
+ +#undef CUVS_INST_IVF_FLAT_BUILD_EXTEND + +} // namespace cuvs::neighbors::ivf_flat diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build_float_int64_t.cpp b/cpp/src/neighbors/ivf_flat/ivf_flat_build_float_int64_t.cpp deleted file mode 100644 index 177aaac11..000000000 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_build_float_int64_t.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * NOTE: this file is generated by generate_ivf_flat.py - * - * Make changes there and run in this directory: - * - * > python generate_ivf_flat.py - * - */ - -#include -#include - -namespace cuvs::neighbors::ivf_flat { - -#define CUVS_INST_IVF_FLAT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::ivf_flat::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::ivf_flat::index \ - { \ - return cuvs::neighbors::ivf_flat::index( \ - std::move(raft::runtime::neighbors::ivf_flat::build(handle, params, dataset))); \ - } \ - \ - void build(raft::resources const& handle, \ - const cuvs::neighbors::ivf_flat::index_params& params, \ - raft::device_matrix_view dataset, \ - cuvs::neighbors::ivf_flat::index& idx) \ - { \ - raft::runtime::neighbors::ivf_flat::build(handle, params, dataset, *idx.get_raft_index()); \ - } -CUVS_INST_IVF_FLAT_BUILD(float, int64_t); - -#undef CUVS_INST_IVF_FLAT_BUILD - -} // namespace cuvs::neighbors::ivf_flat diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build_float_int64_t.cu b/cpp/src/neighbors/ivf_flat/ivf_flat_build_float_int64_t.cu new file mode 100644 index 000000000..56bb71094 --- /dev/null +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_build_float_int64_t.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_ivf_flat.py + * + * Make changes there and run in this directory: + * + * > python generate_ivf_flat.py + * + */ + +#include + +#include "ivf_flat_build.cuh" + +namespace cuvs::neighbors::ivf_flat { + +#define CUVS_INST_IVF_FLAT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::device_matrix_view dataset) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index( \ + std::move(cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset))); \ + } \ + \ + void build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::device_matrix_view dataset, \ + cuvs::neighbors::ivf_flat::index& idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset, idx); \ + } \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::host_matrix_view dataset) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index( \ + std::move(cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset))); \ + } \ + \ + void build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::host_matrix_view dataset, \ + cuvs::neighbors::ivf_flat::index& idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset, idx); \ + } +CUVS_INST_IVF_FLAT_BUILD(float, int64_t); + +#undef CUVS_INST_IVF_FLAT_BUILD + +} // namespace cuvs::neighbors::ivf_flat diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build_int8_t_int64_t.cpp b/cpp/src/neighbors/ivf_flat/ivf_flat_build_int8_t_int64_t.cpp deleted file mode 100644 index 6fe6e2b8d..000000000 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_build_int8_t_int64_t.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by generate_ivf_flat.py - * - * Make changes there and run in this directory: - * - * > python generate_ivf_flat.py - * - */ - -#include -#include - -namespace cuvs::neighbors::ivf_flat { - -#define CUVS_INST_IVF_FLAT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::ivf_flat::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::ivf_flat::index \ - { \ - return cuvs::neighbors::ivf_flat::index( \ - std::move(raft::runtime::neighbors::ivf_flat::build(handle, params, dataset))); \ - } \ - \ - void build(raft::resources const& handle, \ - const cuvs::neighbors::ivf_flat::index_params& params, \ - raft::device_matrix_view dataset, \ - cuvs::neighbors::ivf_flat::index& idx) \ - { \ - raft::runtime::neighbors::ivf_flat::build(handle, params, dataset, *idx.get_raft_index()); \ - } -CUVS_INST_IVF_FLAT_BUILD(int8_t, int64_t); - -#undef CUVS_INST_IVF_FLAT_BUILD - -} // namespace cuvs::neighbors::ivf_flat diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build_int8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat/ivf_flat_build_int8_t_int64_t.cu new file mode 100644 index 000000000..4803868c0 --- /dev/null +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_build_int8_t_int64_t.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_ivf_flat.py + * + * Make changes there and run in this directory: + * + * > python generate_ivf_flat.py + * + */ + +#include + +#include "ivf_flat_build.cuh" + +namespace cuvs::neighbors::ivf_flat { + +#define CUVS_INST_IVF_FLAT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::device_matrix_view dataset) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index( \ + std::move(cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset))); \ + } \ + \ + void build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::device_matrix_view dataset, \ + cuvs::neighbors::ivf_flat::index& idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset, idx); \ + } \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::host_matrix_view dataset) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index( \ + std::move(cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset))); \ + } \ + \ + void build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::host_matrix_view dataset, \ + cuvs::neighbors::ivf_flat::index& idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset, idx); \ + } +CUVS_INST_IVF_FLAT_BUILD(int8_t, int64_t); + +#undef CUVS_INST_IVF_FLAT_BUILD + +} // namespace cuvs::neighbors::ivf_flat diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build_uint8_t_int64_t.cpp b/cpp/src/neighbors/ivf_flat/ivf_flat_build_uint8_t_int64_t.cpp deleted file mode 100644 index 01098ed45..000000000 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_build_uint8_t_int64_t.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by generate_ivf_flat.py - * - * Make changes there and run in this directory: - * - * > python generate_ivf_flat.py - * - */ - -#include -#include - -namespace cuvs::neighbors::ivf_flat { - -#define CUVS_INST_IVF_FLAT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::ivf_flat::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::ivf_flat::index \ - { \ - return cuvs::neighbors::ivf_flat::index( \ - std::move(raft::runtime::neighbors::ivf_flat::build(handle, params, dataset))); \ - } \ - \ - void build(raft::resources const& handle, \ - const cuvs::neighbors::ivf_flat::index_params& params, \ - raft::device_matrix_view dataset, \ - cuvs::neighbors::ivf_flat::index& idx) \ - { \ - raft::runtime::neighbors::ivf_flat::build(handle, params, dataset, *idx.get_raft_index()); \ - } -CUVS_INST_IVF_FLAT_BUILD(uint8_t, int64_t); - -#undef CUVS_INST_IVF_FLAT_BUILD - -} // namespace cuvs::neighbors::ivf_flat diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat/ivf_flat_build_uint8_t_int64_t.cu new file mode 100644 index 000000000..e087f94c4 --- /dev/null +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_build_uint8_t_int64_t.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_ivf_flat.py + * + * Make changes there and run in this directory: + * + * > python generate_ivf_flat.py + * + */ + +#include + +#include "ivf_flat_build.cuh" + +namespace cuvs::neighbors::ivf_flat { + +#define CUVS_INST_IVF_FLAT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::device_matrix_view dataset) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index( \ + std::move(cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset))); \ + } \ + \ + void build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::device_matrix_view dataset, \ + cuvs::neighbors::ivf_flat::index& idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset, idx); \ + } \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::host_matrix_view dataset) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index( \ + std::move(cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset))); \ + } \ + \ + void build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_flat::index_params& params, \ + raft::host_matrix_view dataset, \ + cuvs::neighbors::ivf_flat::index& idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::build(handle, params, dataset, idx); \ + } +CUVS_INST_IVF_FLAT_BUILD(uint8_t, int64_t); + +#undef CUVS_INST_IVF_FLAT_BUILD + +} // namespace cuvs::neighbors::ivf_flat diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_extend_float_int64_t.cpp b/cpp/src/neighbors/ivf_flat/ivf_flat_extend_float_int64_t.cpp deleted file mode 100644 index 04ca3a50f..000000000 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_extend_float_int64_t.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by generate_ivf_flat.py - * - * Make changes there and run in this directory: - * - * > python generate_ivf_flat.py - * - */ - -#include -#include - -namespace cuvs::neighbors::ivf_flat { - -#define CUVS_INST_IVF_FLAT_EXTEND(T, IdxT) \ - auto extend(raft::resources const& handle, \ - raft::device_matrix_view new_vectors, \ - std::optional> new_indices, \ - const cuvs::neighbors::ivf_flat::index& orig_index) \ - ->cuvs::neighbors::ivf_flat::index \ - { \ - return cuvs::neighbors::ivf_flat::index( \ - std::move(raft::runtime::neighbors::ivf_flat::extend( \ - handle, new_vectors, new_indices, *orig_index.get_raft_index()))); \ - } \ - \ - void extend(raft::resources const& handle, \ - raft::device_matrix_view new_vectors, \ - std::optional> new_indices, \ - cuvs::neighbors::ivf_flat::index* idx) \ - { \ - raft::runtime::neighbors::ivf_flat::extend( \ - handle, new_vectors, new_indices, idx->get_raft_index()); \ - } -CUVS_INST_IVF_FLAT_EXTEND(float, int64_t); - -#undef CUVS_INST_IVF_FLAT_EXTEND - -} // namespace cuvs::neighbors::ivf_flat diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_extend_float_int64_t.cu b/cpp/src/neighbors/ivf_flat/ivf_flat_extend_float_int64_t.cu new file mode 100644 index 000000000..2636067bf --- /dev/null +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_extend_float_int64_t.cu @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_ivf_flat.py + * + * Make changes there and run in this directory: + * + * > python generate_ivf_flat.py + * + */ + +#include + +#include "ivf_flat_build.cuh" + +namespace cuvs::neighbors::ivf_flat { + +#define CUVS_INST_IVF_FLAT_EXTEND(T, IdxT) \ + auto extend(raft::resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + const cuvs::neighbors::ivf_flat::index& orig_index) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index(std::move( \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, orig_index))); \ + } \ + \ + void extend(raft::resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + cuvs::neighbors::ivf_flat::index* idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, idx); \ + } \ + auto extend(raft::resources const& handle, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices, \ + const cuvs::neighbors::ivf_flat::index& orig_index) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index(std::move( \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, orig_index))); \ + } \ + \ + void extend(raft::resources const& handle, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices, \ + cuvs::neighbors::ivf_flat::index* idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, idx); \ + } +CUVS_INST_IVF_FLAT_EXTEND(float, int64_t); + +#undef CUVS_INST_IVF_FLAT_EXTEND + +} // namespace cuvs::neighbors::ivf_flat diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_extend_int8_t_int64_t.cpp b/cpp/src/neighbors/ivf_flat/ivf_flat_extend_int8_t_int64_t.cpp deleted file mode 100644 index accc53e04..000000000 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_extend_int8_t_int64_t.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by generate_ivf_flat.py - * - * Make changes there and run in this directory: - * - * > python generate_ivf_flat.py - * - */ - -#include -#include - -namespace cuvs::neighbors::ivf_flat { - -#define CUVS_INST_IVF_FLAT_EXTEND(T, IdxT) \ - auto extend(raft::resources const& handle, \ - raft::device_matrix_view new_vectors, \ - std::optional> new_indices, \ - const cuvs::neighbors::ivf_flat::index& orig_index) \ - ->cuvs::neighbors::ivf_flat::index \ - { \ - return cuvs::neighbors::ivf_flat::index( \ - std::move(raft::runtime::neighbors::ivf_flat::extend( \ - handle, new_vectors, new_indices, *orig_index.get_raft_index()))); \ - } \ - \ - void extend(raft::resources const& handle, \ - raft::device_matrix_view new_vectors, \ - std::optional> new_indices, \ - cuvs::neighbors::ivf_flat::index* idx) \ - { \ - raft::runtime::neighbors::ivf_flat::extend( \ - handle, new_vectors, new_indices, idx->get_raft_index()); \ - } -CUVS_INST_IVF_FLAT_EXTEND(int8_t, int64_t); - -#undef CUVS_INST_IVF_FLAT_EXTEND - -} // namespace cuvs::neighbors::ivf_flat diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_extend_int8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat/ivf_flat_extend_int8_t_int64_t.cu new file mode 100644 index 000000000..191cb9f39 --- /dev/null +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_extend_int8_t_int64_t.cu @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_ivf_flat.py + * + * Make changes there and run in this directory: + * + * > python generate_ivf_flat.py + * + */ + +#include + +#include "ivf_flat_build.cuh" + +namespace cuvs::neighbors::ivf_flat { + +#define CUVS_INST_IVF_FLAT_EXTEND(T, IdxT) \ + auto extend(raft::resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + const cuvs::neighbors::ivf_flat::index& orig_index) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index(std::move( \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, orig_index))); \ + } \ + \ + void extend(raft::resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + cuvs::neighbors::ivf_flat::index* idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, idx); \ + } \ + auto extend(raft::resources const& handle, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices, \ + const cuvs::neighbors::ivf_flat::index& orig_index) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index(std::move( \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, orig_index))); \ + } \ + \ + void extend(raft::resources const& handle, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices, \ + cuvs::neighbors::ivf_flat::index* idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, idx); \ + } +CUVS_INST_IVF_FLAT_EXTEND(int8_t, int64_t); + +#undef CUVS_INST_IVF_FLAT_EXTEND + +} // namespace cuvs::neighbors::ivf_flat diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_extend_uint8_t_int64_t.cpp b/cpp/src/neighbors/ivf_flat/ivf_flat_extend_uint8_t_int64_t.cpp deleted file mode 100644 index e44ae51b1..000000000 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_extend_uint8_t_int64_t.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* - * NOTE: this file is generated by generate_ivf_flat.py - * - * Make changes there and run in this directory: - * - * > python generate_ivf_flat.py - * - */ - -#include -#include - -namespace cuvs::neighbors::ivf_flat { - -#define CUVS_INST_IVF_FLAT_EXTEND(T, IdxT) \ - auto extend(raft::resources const& handle, \ - raft::device_matrix_view new_vectors, \ - std::optional> new_indices, \ - const cuvs::neighbors::ivf_flat::index& orig_index) \ - ->cuvs::neighbors::ivf_flat::index \ - { \ - return cuvs::neighbors::ivf_flat::index( \ - std::move(raft::runtime::neighbors::ivf_flat::extend( \ - handle, new_vectors, new_indices, *orig_index.get_raft_index()))); \ - } \ - \ - void extend(raft::resources const& handle, \ - raft::device_matrix_view new_vectors, \ - std::optional> new_indices, \ - cuvs::neighbors::ivf_flat::index* idx) \ - { \ - raft::runtime::neighbors::ivf_flat::extend( \ - handle, new_vectors, new_indices, idx->get_raft_index()); \ - } -CUVS_INST_IVF_FLAT_EXTEND(uint8_t, int64_t); - -#undef CUVS_INST_IVF_FLAT_EXTEND - -} // namespace cuvs::neighbors::ivf_flat diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_extend_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat/ivf_flat_extend_uint8_t_int64_t.cu new file mode 100644 index 000000000..29b7e7b69 --- /dev/null +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_extend_uint8_t_int64_t.cu @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_ivf_flat.py + * + * Make changes there and run in this directory: + * + * > python generate_ivf_flat.py + * + */ + +#include + +#include "ivf_flat_build.cuh" + +namespace cuvs::neighbors::ivf_flat { + +#define CUVS_INST_IVF_FLAT_EXTEND(T, IdxT) \ + auto extend(raft::resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + const cuvs::neighbors::ivf_flat::index& orig_index) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index(std::move( \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, orig_index))); \ + } \ + \ + void extend(raft::resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + cuvs::neighbors::ivf_flat::index* idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, idx); \ + } \ + auto extend(raft::resources const& handle, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices, \ + const cuvs::neighbors::ivf_flat::index& orig_index) \ + ->cuvs::neighbors::ivf_flat::index \ + { \ + return cuvs::neighbors::ivf_flat::index(std::move( \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, orig_index))); \ + } \ + \ + void extend(raft::resources const& handle, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices, \ + cuvs::neighbors::ivf_flat::index* idx) \ + { \ + cuvs::neighbors::ivf_flat::detail::extend(handle, new_vectors, new_indices, idx); \ + } +CUVS_INST_IVF_FLAT_EXTEND(uint8_t, int64_t); + +#undef CUVS_INST_IVF_FLAT_EXTEND + +} // namespace cuvs::neighbors::ivf_flat diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_helpers.cu b/cpp/src/neighbors/ivf_flat/ivf_flat_helpers.cu new file mode 100644 index 000000000..be1eb87a0 --- /dev/null +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_helpers.cu @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "ivf_flat_helpers.cuh" +#include + +namespace cuvs::neighbors::ivf_flat::helpers { +namespace codepacker { +namespace detail { +template +void pack( + raft::resources const& res, + raft::device_matrix_view codes, + uint32_t veclen, + uint32_t offset, + raft::device_mdspan::list_extents, raft::row_major> + list_data) +{ + pack_list_data(res, codes, veclen, offset, list_data); +} + +template +void unpack( + raft::resources const& res, + raft::device_mdspan::list_extents, raft::row_major> + list_data, + uint32_t veclen, + uint32_t offset, + raft::device_matrix_view codes) +{ + unpack_list_data(res, list_data, veclen, offset, codes); +} + +template +void pack_1(const T* flat_code, T* block, uint32_t dim, uint32_t veclen, uint32_t offset) +{ + // The data is written in interleaved groups of `index::kGroupSize` vectors + using interleaved_group = cuvs::neighbors::detail::div_utils; + + // Interleave dimensions of the source vector while recording it. 
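+  // For example, pack_1(flat_code, block, dim, veclen, offset) writes the `dim` values of one flat
+  // vector into the interleaved group of `block` that holds list position `offset`; unpack_1 below
+  // performs the exact inverse and recovers the flat vector from the same location.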
+ // NB: such `veclen` is selected, that `dim % veclen == 0` + auto group_offset = interleaved_group::roundDown(offset); + auto ingroup_id = interleaved_group::mod(offset) * veclen; + + for (uint32_t l = 0; l < dim; l += veclen) { + for (uint32_t j = 0; j < veclen; j++) { + block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j] = flat_code[l + j]; + } + } +} + +template +void unpack_1(const T* block, T* flat_code, uint32_t dim, uint32_t veclen, uint32_t offset) +{ + // The data is written in interleaved groups of `index::kGroupSize` vectors + using interleaved_group = cuvs::neighbors::detail::div_utils; + + // NB: such `veclen` is selected, that `dim % veclen == 0` + auto group_offset = interleaved_group::roundDown(offset); + auto ingroup_id = interleaved_group::mod(offset) * veclen; + + for (uint32_t l = 0; l < dim; l += veclen) { + for (uint32_t j = 0; j < veclen; j++) { + flat_code[l + j] = block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j]; + } + } +} +} // namespace detail + +void pack(raft::resources const& res, + raft::device_matrix_view codes, + uint32_t veclen, + uint32_t offset, + raft::device_mdspan::list_extents, + raft::row_major> list_data) +{ + detail::pack(res, codes, veclen, offset, list_data); +} + +void pack(raft::resources const& res, + raft::device_matrix_view codes, + uint32_t veclen, + uint32_t offset, + raft::device_mdspan::list_extents, + raft::row_major> list_data) +{ + detail::pack(res, codes, veclen, offset, list_data); +} + +void pack(raft::resources const& res, + raft::device_matrix_view codes, + uint32_t veclen, + uint32_t offset, + raft::device_mdspan::list_extents, + raft::row_major> list_data) +{ + detail::pack(res, codes, veclen, offset, list_data); +} + +void unpack(raft::resources const& res, + raft::device_mdspan::list_extents, + raft::row_major> list_data, + uint32_t veclen, + uint32_t offset, + raft::device_matrix_view codes) +{ + detail::unpack(res, list_data, veclen, offset, codes); +} + +void unpack(raft::resources const& res, + raft::device_mdspan::list_extents, + raft::row_major> list_data, + uint32_t veclen, + uint32_t offset, + raft::device_matrix_view codes) +{ + detail::unpack(res, list_data, veclen, offset, codes); +} + +void unpack(raft::resources const& res, + raft::device_mdspan::list_extents, + raft::row_major> list_data, + uint32_t veclen, + uint32_t offset, + raft::device_matrix_view codes) +{ + detail::unpack(res, list_data, veclen, offset, codes); +} + +void pack_1(const float* flat_code, float* block, uint32_t dim, uint32_t veclen, uint32_t offset) +{ + detail::pack_1(flat_code, block, dim, veclen, offset); +} + +void pack_1(const int8_t* flat_code, int8_t* block, uint32_t dim, uint32_t veclen, uint32_t offset) +{ + detail::pack_1(flat_code, block, dim, veclen, offset); +} + +void pack_1( + const uint8_t* flat_code, uint8_t* block, uint32_t dim, uint32_t veclen, uint32_t offset) +{ + detail::pack_1(flat_code, block, dim, veclen, offset); +} + +void unpack_1(const float* block, float* flat_code, uint32_t dim, uint32_t veclen, uint32_t offset) +{ + detail::unpack_1(block, flat_code, dim, veclen, offset); +} + +void unpack_1( + const int8_t* block, int8_t* flat_code, uint32_t dim, uint32_t veclen, uint32_t offset) +{ + detail::unpack_1(block, flat_code, dim, veclen, offset); +} + +void unpack_1( + const uint8_t* block, uint8_t* flat_code, uint32_t dim, uint32_t veclen, uint32_t offset) +{ + detail::unpack_1(block, flat_code, dim, veclen, offset); +} + +} // namespace codepacker + +namespace detail { + +template 
+void reset_index(const raft::resources& res, index* idx) +{ + auto stream = raft::resource::get_cuda_stream(res); + + cuvs::spatial::knn::detail::utils::memzero( + idx->accum_sorted_sizes().data_handle(), idx->accum_sorted_sizes().size(), stream); + cuvs::spatial::knn::detail::utils::memzero( + idx->list_sizes().data_handle(), idx->list_sizes().size(), stream); + cuvs::spatial::knn::detail::utils::memzero( + idx->data_ptrs().data_handle(), idx->data_ptrs().size(), stream); + cuvs::spatial::knn::detail::utils::memzero( + idx->inds_ptrs().data_handle(), idx->inds_ptrs().size(), stream); +} + +} // namespace detail + +void reset_index(const raft::resources& res, index* index) +{ + detail::reset_index(res, index); +} + +/** + * @brief Public helper API to reset the data and indices ptrs, and the list sizes. Useful for + * externally modifying the index without going through the build stage. The data and indices of the + * IVF lists will be lost. + * + * Usage example: + * @code{.cpp} + * raft::resources res; + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_flat::index_params index_params; + * // initialize an empty index + * ivf_flat::index index(res, index_params, D); + * // reset the index's state and list sizes + * ivf_flat::helpers::reset_index(res, &index); + * @endcode + * + * @param[in] res raft resource + * @param[inout] index pointer to IVF-Flat index + */ +void reset_index(const raft::resources& res, index* index) +{ + detail::reset_index(res, index); +} + +/** + * @brief Public helper API to reset the data and indices ptrs, and the list sizes. Useful for + * externally modifying the index without going through the build stage. The data and indices of the + * IVF lists will be lost. + * + * Usage example: + * @code{.cpp} + * raft::resources res; + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_flat::index_params index_params; + * // initialize an empty index + * ivf_flat::index index(res, index_params, D); + * // reset the index's state and list sizes + * ivf_flat::helpers::reset_index(res, &index); + * @endcode + * + * @param[in] res raft resource + * @param[inout] index pointer to IVF-Flat index + */ +void reset_index(const raft::resources& res, index* index) +{ + detail::reset_index(res, index); +} + +} // namespace cuvs::neighbors::ivf_flat::helpers diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_helpers.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_helpers.cuh new file mode 100644 index 000000000..f76b5e95f --- /dev/null +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_helpers.cuh @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "../detail/ann_utils.cuh" + +#include "../detail/div_utils.cuh" +#include +#include +#include +#include + +namespace cuvs::neighbors::ivf_flat::helpers::codepacker { + +namespace { + +template +__device__ void pack_1(const T* flat_code, T* block, uint32_t dim, uint32_t veclen, uint32_t offset) +{ + // The data is written in interleaved groups of `index::kGroupSize` vectors + using interleaved_group = cuvs::neighbors::detail::div_utils; + + // Interleave dimensions of the source vector while recording it. + // NB: such `veclen` is selected, that `dim % veclen == 0` + auto group_offset = interleaved_group::roundDown(offset); + auto ingroup_id = interleaved_group::mod(offset) * veclen; + + for (uint32_t l = 0; l < dim; l += veclen) { + for (uint32_t j = 0; j < veclen; j++) { + block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j] = flat_code[l + j]; + } + } +} + +template +__device__ void unpack_1( + const T* block, T* flat_code, uint32_t dim, uint32_t veclen, uint32_t offset) +{ + // The data is written in interleaved groups of `index::kGroupSize` vectors + using interleaved_group = cuvs::neighbors::detail::div_utils; + + // NB: such `veclen` is selected, that `dim % veclen == 0` + auto group_offset = interleaved_group::roundDown(offset); + auto ingroup_id = interleaved_group::mod(offset) * veclen; + + for (uint32_t l = 0; l < dim; l += veclen) { + for (uint32_t j = 0; j < veclen; j++) { + flat_code[l + j] = block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j]; + } + } +} + +template +RAFT_KERNEL pack_interleaved_list_kernel(const T* codes, + T* list_data, + uint32_t n_rows, + uint32_t dim, + uint32_t veclen, + std::variant offset_or_indices) +{ + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const uint32_t dst_ix = std::holds_alternative(offset_or_indices) + ? std::get(offset_or_indices) + tid + : std::get(offset_or_indices)[tid]; + if (tid < n_rows) { pack_1(codes + tid * dim, list_data, dim, veclen, dst_ix); } +} + +template +RAFT_KERNEL unpack_interleaved_list_kernel( + const T* list_data, + T* codes, + uint32_t n_rows, + uint32_t dim, + uint32_t veclen, + std::variant offset_or_indices) +{ + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const uint32_t src_ix = std::holds_alternative(offset_or_indices) + ? 
std::get(offset_or_indices) + tid + : std::get(offset_or_indices)[tid]; + if (tid < n_rows) { unpack_1(list_data, codes + tid * dim, dim, veclen, src_ix); } +} + +template +void pack_list_data( + raft::resources const& res, + raft::device_matrix_view codes, + uint32_t veclen, + std::variant offset_or_indices, + raft::device_mdspan::list_extents, raft::row_major> + list_data) +{ + uint32_t n_rows = codes.extent(0); + uint32_t dim = codes.extent(1); + if (n_rows == 0 || dim == 0) return; + static constexpr uint32_t kBlockSize = 256; + dim3 blocks(raft::div_rounding_up_safe(n_rows, kBlockSize), 1, 1); + dim3 threads(kBlockSize, 1, 1); + auto stream = raft::resource::get_cuda_stream(res); + pack_interleaved_list_kernel<<>>( + codes.data_handle(), list_data.data_handle(), n_rows, dim, veclen, offset_or_indices); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void unpack_list_data( + raft::resources const& res, + raft::device_mdspan::list_extents, raft::row_major> + list_data, + uint32_t veclen, + std::variant offset_or_indices, + raft::device_matrix_view codes) +{ + uint32_t n_rows = codes.extent(0); + uint32_t dim = codes.extent(1); + if (n_rows == 0 || dim == 0) return; + static constexpr uint32_t kBlockSize = 256; + dim3 blocks(raft::div_rounding_up_safe(n_rows, kBlockSize), 1, 1); + dim3 threads(kBlockSize, 1, 1); + auto stream = raft::resource::get_cuda_stream(res); + unpack_interleaved_list_kernel<<>>( + list_data.data_handle(), codes.data_handle(), n_rows, dim, veclen, offset_or_indices); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +} // namespace +} // namespace cuvs::neighbors::ivf_flat::helpers::codepacker diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh new file mode 100644 index 000000000..d63059457 --- /dev/null +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh @@ -0,0 +1,1201 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../ivf_common.cuh" +#include "../sample_filter.cuh" +#include +#include + +#include // RAFT_LOG_TRACE +#include +#include +#include +#include +#include // RAFT_CUDA_TRY +#include +#include +#include +#include + +#include + +namespace cuvs::neighbors::ivf_flat::detail { + +using namespace raft::spatial::knn::detail; // NOLINT + +constexpr int kThreadsPerBlock = 128; + +auto RAFT_WEAK_FUNCTION is_local_topk_feasible(uint32_t k) -> bool +{ + return k <= raft::matrix::detail::select::warpsort::kMaxCapacity; +} + +/** + * @brief Copy `n` elements per block from one place to another. 
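+ *
+ * Implementation note: when `out` and `in` do not share the same alignment offset, the routine
+ * retries with half the vector width (down to scalar copies); otherwise the unaligned head and
+ * tail are handled separately and the aligned middle part is moved with `VecBytes`-wide accesses.
+ * As a rough illustration, with `T = float` and `VecBytes = 16` the bulk of the copy proceeds in
+ * 4-element (float4-sized) chunks per thread per iteration.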
+ * + * @param[out] out target pointer (unique per block) + * @param[in] in source pointer + * @param n number of elements to copy + */ +template +__device__ inline void copy_vectorized(T* out, const T* in, uint32_t n) +{ + constexpr int VecElems = VecBytes / sizeof(T); // NOLINT + using align_bytes = raft::Pow2<(size_t)VecBytes>; + if constexpr (VecElems > 1) { + using align_elems = raft::Pow2; + if (!align_bytes::areSameAlignOffsets(out, in)) { + return copy_vectorized<(VecBytes >> 1), T>(out, in, n); + } + { // process unaligned head + uint32_t head = align_bytes::roundUp(in) - in; + if (head > 0) { + copy_vectorized(out, in, head); + n -= head; + in += head; + out += head; + } + } + { // process main part vectorized + using vec_t = typename raft::IOType::Type; + copy_vectorized( + reinterpret_cast(out), reinterpret_cast(in), align_elems::div(n)); + } + { // process unaligned tail + uint32_t tail = align_elems::mod(n); + if (tail > 0) { + n -= tail; + copy_vectorized(out + n, in + n, tail); + } + } + } + if constexpr (VecElems <= 1) { + for (int i = threadIdx.x; i < n; i += blockDim.x) { + out[i] = in[i]; + } + } +} + +/** + * @brief Load a part of a vector from the index and from query, compute the (part of the) distance + * between them, and aggregate it using the provided Lambda; one structure per thread, per query, + * and per index item. + * + * @tparam kUnroll elements per loop (normally, kUnroll = WarpSize / Veclen) + * @tparam Lambda computing the part of the distance for one dimension and aggregating it: + * void (AccT& acc, AccT x, AccT y) + * @tparam Veclen size of the vectorized load + * @tparam T type of the data in the query and the index + * @tparam AccT type of the accumulated value (an optimization for 8bit values to be loaded as 32bit + * values) + */ +template +struct loadAndComputeDist { + Lambda compute_dist; + AccT& dist; + + __device__ __forceinline__ loadAndComputeDist(AccT& dist, Lambda op) + : dist(dist), compute_dist(op) + { + } + + /** + * Load parts of vectors from the index and query and accumulates the partial distance. + * This version assumes the query is stored in shared memory. + * Every thread here processes exactly kUnroll * Veclen elements independently of others. + */ + template + __device__ __forceinline__ void runLoadShmemCompute(const T* const& data, + const T* query_shared, + IdxT loadIndex, + IdxT shmemIndex) + { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + T encV[Veclen]; + raft::ldg(encV, data + (loadIndex + j * kIndexGroupSize) * Veclen); + T queryRegs[Veclen]; + raft::lds(queryRegs, &query_shared[shmemIndex + j * Veclen]); +#pragma unroll + for (int k = 0; k < Veclen; ++k) { + compute_dist(dist, queryRegs[k], encV[k]); + } + } + } + + /** + * Load parts of vectors from the index and query and accumulates the partial distance. + * This version assumes the query is stored in the global memory and is different for every + * thread. One warp loads exactly WarpSize query elements at once and then reshuffles them into + * corresponding threads (`WarpSize / (kUnroll * Veclen)` elements per thread at once). 
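+ *
+ * A minimal numeric sketch (assuming WarpSize = 32): with Veclen = 4 and kUnroll = 8, the stride
+ * is kUnroll * Veclen = 32, so totalIter = 1; each lane loads a single query element, the warp
+ * covers dimensions [baseLoadIndex, baseLoadIndex + 32) via shuffles, and `data` advances by
+ * stride * kIndexGroupSize = 32 * 32 = 1024 elements per outer iteration.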
+ */ + template + __device__ __forceinline__ void runLoadShflAndCompute(const T*& data, + const T* query, + IdxT baseLoadIndex, + const int lane_id) + { + T queryReg = query[baseLoadIndex + lane_id]; + constexpr int stride = kUnroll * Veclen; + constexpr int totalIter = raft::WarpSize / stride; + constexpr int gmemStride = stride * kIndexGroupSize; +#pragma unroll + for (int i = 0; i < totalIter; ++i, data += gmemStride) { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + T encV[Veclen]; + raft::ldg(encV, data + (lane_id + j * kIndexGroupSize) * Veclen); + const int d = (i * kUnroll + j) * Veclen; +#pragma unroll + for (int k = 0; k < Veclen; ++k) { + compute_dist(dist, raft::shfl(queryReg, d + k, raft::WarpSize), encV[k]); + } + } + } + } + + /** + * Load parts of vectors from the index and query and accumulates the partial distance. + * This version augments `runLoadShflAndCompute` when `dim` is not a multiple of `WarpSize`. + */ + __device__ __forceinline__ void runLoadShflAndComputeRemainder( + const T*& data, const T* query, const int lane_id, const int dim, const int dimBlocks) + { + const int loadDim = dimBlocks + lane_id; + T queryReg = loadDim < dim ? query[loadDim] : T{0}; + const int loadDataIdx = lane_id * Veclen; + for (int d = 0; d < dim - dimBlocks; d += Veclen, data += kIndexGroupSize * Veclen) { + T enc[Veclen]; + raft::ldg(enc, data + loadDataIdx); +#pragma unroll + for (int k = 0; k < Veclen; k++) { + compute_dist(dist, raft::shfl(queryReg, d + k, raft::WarpSize), enc[k]); + } + } + } +}; + +// This handles uint8_t 8, 16 Veclens +template +struct loadAndComputeDist { + Lambda compute_dist; + uint32_t& dist; + + __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op) + : dist(dist), compute_dist(op) + { + } + + __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data, + const uint8_t* query_shared, + int loadIndex, + int shmemIndex) + { + constexpr int veclen_int = uint8_veclen / 4; // converting uint8_t veclens to int + loadIndex = loadIndex * veclen_int; +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + uint32_t encV[veclen_int]; + raft::ldg( + encV, + reinterpret_cast(data) + loadIndex + j * kIndexGroupSize * veclen_int); + uint32_t queryRegs[veclen_int]; + raft::lds(queryRegs, + reinterpret_cast(query_shared + shmemIndex) + j * veclen_int); +#pragma unroll + for (int k = 0; k < veclen_int; k++) { + compute_dist(dist, queryRegs[k], encV[k]); + } + } + } + __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data, + const uint8_t* query, + int baseLoadIndex, + const int lane_id) + { + constexpr int veclen_int = uint8_veclen / 4; // converting uint8_t veclens to int + uint32_t queryReg = + (lane_id < 8) ? 
reinterpret_cast(query + baseLoadIndex)[lane_id] : 0; + constexpr int stride = kUnroll * uint8_veclen; + +#pragma unroll + for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + uint32_t encV[veclen_int]; + raft::ldg( + encV, + reinterpret_cast(data) + (lane_id + j * kIndexGroupSize) * veclen_int); + const int d = (i * kUnroll + j) * veclen_int; +#pragma unroll + for (int k = 0; k < veclen_int; ++k) { + compute_dist(dist, raft::shfl(queryReg, d + k, raft::WarpSize), encV[k]); + } + } + } + } + + __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data, + const uint8_t* query, + const int lane_id, + const int dim, + const int dimBlocks) + { + constexpr int veclen_int = uint8_veclen / 4; + const int loadDim = dimBlocks + lane_id * 4; // Here 4 is for 1 - int + uint32_t queryReg = loadDim < dim ? reinterpret_cast(query + loadDim)[0] : 0; + for (int d = 0; d < dim - dimBlocks; + d += uint8_veclen, data += kIndexGroupSize * uint8_veclen) { + uint32_t enc[veclen_int]; + raft::ldg(enc, reinterpret_cast(data) + lane_id * veclen_int); +#pragma unroll + for (int k = 0; k < veclen_int; k++) { + uint32_t q = raft::shfl(queryReg, (d / 4) + k, raft::WarpSize); + compute_dist(dist, q, enc[k]); + } + } + } +}; + +// Keep this specialized uint8 Veclen = 4, because compiler is generating suboptimal code while +// using above common template of int2/int4 +template +struct loadAndComputeDist { + Lambda compute_dist; + uint32_t& dist; + + __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op) + : dist(dist), compute_dist(op) + { + } + + __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data, + const uint8_t* query_shared, + int loadIndex, + int shmemIndex) + { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + uint32_t encV = reinterpret_cast(data)[loadIndex + j * kIndexGroupSize]; + uint32_t queryRegs = reinterpret_cast(query_shared + shmemIndex)[j]; + compute_dist(dist, queryRegs, encV); + } + } + __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data, + const uint8_t* query, + int baseLoadIndex, + const int lane_id) + { + uint32_t queryReg = + (lane_id < 8) ? reinterpret_cast(query + baseLoadIndex)[lane_id] : 0; + constexpr int veclen = 4; + constexpr int stride = kUnroll * veclen; + +#pragma unroll + for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + uint32_t encV = reinterpret_cast(data)[lane_id + j * kIndexGroupSize]; + uint32_t q = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize); + compute_dist(dist, q, encV); + } + } + } + + __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data, + const uint8_t* query, + const int lane_id, + const int dim, + const int dimBlocks) + { + constexpr int veclen = 4; + const int loadDim = dimBlocks + lane_id; + uint32_t queryReg = loadDim < dim ? 
reinterpret_cast(query)[loadDim] : 0; + for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { + uint32_t enc = reinterpret_cast(data)[lane_id]; + uint32_t q = raft::shfl(queryReg, d / veclen, raft::WarpSize); + compute_dist(dist, q, enc); + } + } +}; + +template +struct loadAndComputeDist { + Lambda compute_dist; + uint32_t& dist; + + __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op) + : dist(dist), compute_dist(op) + { + } + + __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data, + const uint8_t* query_shared, + int loadIndex, + int shmemIndex) + { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + uint32_t encV = reinterpret_cast(data)[loadIndex + j * kIndexGroupSize]; + uint32_t queryRegs = reinterpret_cast(query_shared + shmemIndex)[j]; + compute_dist(dist, queryRegs, encV); + } + } + + __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data, + const uint8_t* query, + int baseLoadIndex, + const int lane_id) + { + uint32_t queryReg = + (lane_id < 16) ? reinterpret_cast(query + baseLoadIndex)[lane_id] : 0; + constexpr int veclen = 2; + constexpr int stride = kUnroll * veclen; + +#pragma unroll + for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + uint32_t encV = reinterpret_cast(data)[lane_id + j * kIndexGroupSize]; + uint32_t q = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize); + compute_dist(dist, q, encV); + } + } + } + + __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data, + const uint8_t* query, + const int lane_id, + const int dim, + const int dimBlocks) + { + constexpr int veclen = 2; + int loadDim = dimBlocks + lane_id * veclen; + uint32_t queryReg = loadDim < dim ? 
reinterpret_cast(query + loadDim)[0] : 0; + for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { + uint32_t enc = reinterpret_cast(data)[lane_id]; + uint32_t q = raft::shfl(queryReg, d / veclen, raft::WarpSize); + compute_dist(dist, q, enc); + } + } +}; + +template +struct loadAndComputeDist { + Lambda compute_dist; + uint32_t& dist; + + __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op) + : dist(dist), compute_dist(op) + { + } + + __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data, + const uint8_t* query_shared, + int loadIndex, + int shmemIndex) + { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + uint32_t encV = data[loadIndex + j * kIndexGroupSize]; + uint32_t queryRegs = query_shared[shmemIndex + j]; + compute_dist(dist, queryRegs, encV); + } + } + + __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data, + const uint8_t* query, + int baseLoadIndex, + const int lane_id) + { + uint32_t queryReg = query[baseLoadIndex + lane_id]; + constexpr int veclen = 1; + constexpr int stride = kUnroll * veclen; + +#pragma unroll + for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + uint32_t encV = data[lane_id + j * kIndexGroupSize]; + uint32_t q = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize); + compute_dist(dist, q, encV); + } + } + } + + __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data, + const uint8_t* query, + const int lane_id, + const int dim, + const int dimBlocks) + { + constexpr int veclen = 1; + int loadDim = dimBlocks + lane_id; + uint32_t queryReg = loadDim < dim ? query[loadDim] : 0; + for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { + uint32_t enc = data[lane_id]; + uint32_t q = raft::shfl(queryReg, d, raft::WarpSize); + compute_dist(dist, q, enc); + } + } +}; + +// This device function is for int8 veclens 4, 8 and 16 +template +struct loadAndComputeDist { + Lambda compute_dist; + int32_t& dist; + + __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op) + : dist(dist), compute_dist(op) + { + } + + __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data, + const int8_t* query_shared, + int loadIndex, + int shmemIndex) + { + constexpr int veclen_int = int8_veclen / 4; // converting int8_t veclens to int + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + int32_t encV[veclen_int]; + raft::ldg( + encV, + reinterpret_cast(data) + (loadIndex + j * kIndexGroupSize) * veclen_int); + int32_t queryRegs[veclen_int]; + raft::lds(queryRegs, + reinterpret_cast(query_shared + shmemIndex) + j * veclen_int); +#pragma unroll + for (int k = 0; k < veclen_int; k++) { + compute_dist(dist, queryRegs[k], encV[k]); + } + } + } + + __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data, + const int8_t* query, + int baseLoadIndex, + const int lane_id) + { + constexpr int veclen_int = int8_veclen / 4; // converting int8_t veclens to int + + int32_t queryReg = + (lane_id < 8) ? 
reinterpret_cast(query + baseLoadIndex)[lane_id] : 0; + constexpr int stride = kUnroll * int8_veclen; + +#pragma unroll + for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + int32_t encV[veclen_int]; + raft::ldg( + encV, + reinterpret_cast(data) + (lane_id + j * kIndexGroupSize) * veclen_int); + const int d = (i * kUnroll + j) * veclen_int; +#pragma unroll + for (int k = 0; k < veclen_int; ++k) { + int32_t q = raft::shfl(queryReg, d + k, raft::WarpSize); + compute_dist(dist, q, encV[k]); + } + } + } + } + + __device__ __forceinline__ void runLoadShflAndComputeRemainder( + const int8_t*& data, const int8_t* query, const int lane_id, const int dim, const int dimBlocks) + { + constexpr int veclen_int = int8_veclen / 4; + const int loadDim = dimBlocks + lane_id * 4; // Here 4 is for 1 - int; + int32_t queryReg = loadDim < dim ? reinterpret_cast(query + loadDim)[0] : 0; + for (int d = 0; d < dim - dimBlocks; d += int8_veclen, data += kIndexGroupSize * int8_veclen) { + int32_t enc[veclen_int]; + raft::ldg(enc, reinterpret_cast(data) + lane_id * veclen_int); +#pragma unroll + for (int k = 0; k < veclen_int; k++) { + int32_t q = raft::shfl(queryReg, (d / 4) + k, raft::WarpSize); // Here 4 is for 1 - int; + compute_dist(dist, q, enc[k]); + } + } + } +}; + +template +struct loadAndComputeDist { + Lambda compute_dist; + int32_t& dist; + __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op) + : dist(dist), compute_dist(op) + { + } + __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data, + const int8_t* query_shared, + int loadIndex, + int shmemIndex) + { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + int32_t encV = reinterpret_cast(data)[loadIndex + j * kIndexGroupSize]; + int32_t queryRegs = reinterpret_cast(query_shared + shmemIndex)[j]; + compute_dist(dist, queryRegs, encV); + } + } + + __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data, + const int8_t* query, + int baseLoadIndex, + const int lane_id) + { + int32_t queryReg = + (lane_id < 16) ? reinterpret_cast(query + baseLoadIndex)[lane_id] : 0; + constexpr int veclen = 2; + constexpr int stride = kUnroll * veclen; + +#pragma unroll + for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + int32_t encV = reinterpret_cast(data)[lane_id + j * kIndexGroupSize]; + int32_t q = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize); + compute_dist(dist, q, encV); + } + } + } + + __device__ __forceinline__ void runLoadShflAndComputeRemainder( + const int8_t*& data, const int8_t* query, const int lane_id, const int dim, const int dimBlocks) + { + constexpr int veclen = 2; + int loadDim = dimBlocks + lane_id * veclen; + int32_t queryReg = loadDim < dim ? 
reinterpret_cast(query + loadDim)[0] : 0; + for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { + int32_t enc = reinterpret_cast(data + lane_id * veclen)[0]; + int32_t q = raft::shfl(queryReg, d / veclen, raft::WarpSize); + compute_dist(dist, q, enc); + } + } +}; + +template +struct loadAndComputeDist { + Lambda compute_dist; + int32_t& dist; + __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op) + : dist(dist), compute_dist(op) + { + } + + __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data, + const int8_t* query_shared, + int loadIndex, + int shmemIndex) + { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + compute_dist(dist, query_shared[shmemIndex + j], data[loadIndex + j * kIndexGroupSize]); + } + } + + __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data, + const int8_t* query, + int baseLoadIndex, + const int lane_id) + { + constexpr int veclen = 1; + constexpr int stride = kUnroll * veclen; + int32_t queryReg = query[baseLoadIndex + lane_id]; + +#pragma unroll + for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + compute_dist(dist, + raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize), + data[lane_id + j * kIndexGroupSize]); + } + } + } + __device__ __forceinline__ void runLoadShflAndComputeRemainder( + const int8_t*& data, const int8_t* query, const int lane_id, const int dim, const int dimBlocks) + { + constexpr int veclen = 1; + const int loadDim = dimBlocks + lane_id; + int32_t queryReg = loadDim < dim ? query[loadDim] : 0; + for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { + compute_dist(dist, raft::shfl(queryReg, d, raft::WarpSize), data[lane_id]); + } + } +}; + +// switch to dummy blocksort when Capacity is 0 this explicit dummy is chosen +// to support access to warpsort constants like ::queue_t::kDummy +template +struct flat_block_sort { + using type = raft::matrix::detail::select::warpsort::block_sort< + raft::matrix::detail::select::warpsort::warp_sort_filtered, + Capacity, + Ascending, + T, + IdxT>; +}; + +template +struct flat_block_sort<0, Ascending, T, IdxT> + : ivf::detail::dummy_block_sort_t { + using type = ivf::detail::dummy_block_sort_t; +}; + +template +using block_sort_t = typename flat_block_sort::type; + +/** + * Scan clusters for nearest neighbors of the query vectors. + * See `ivfflat_interleaved_scan` for more information. + * + * The clusters are stored in the interleaved index format described in ivf_flat_types.hpp. + * For each query vector, a set of clusters is probed: the distance to each vector in the cluster is + * calculated, and the top-k nearest neighbors are selected. + * + * @param compute_dist distance function + * @param query_smem_elems number of dimensions of the query vector to fit in a shared memory of a + * block; this number must be a multiple of `WarpSize * Veclen`. 
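+ *   (the host-side launcher caps this at 16 KiB worth of elements, e.g. at most 4096 dimensions
+ *   for float queries; dimensions beyond that are read through the warp-shuffle path instead)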
+ * @param[in] query a pointer to all queries in a row-major contiguous format [gridDim.y, dim] + * @param[in] coarse_index a pointer to the cluster indices to search through [n_probes] + * @param[in] list_indices index.indices + * @param[in] list_data index.data + * @param[in] list_sizes index.list_sizes + * @param[in] list_offsets index.list_offsets + * @param n_probes + * @param k + * @param dim + * @param sample_filter + * @param[out] neighbors + * @param[out] distances + */ +template +RAFT_KERNEL __launch_bounds__(kThreadsPerBlock) + interleaved_scan_kernel(Lambda compute_dist, + PostLambda post_process, + const uint32_t query_smem_elems, + const T* query, + const uint32_t* coarse_index, + const T* const* list_data_ptrs, + const uint32_t* list_sizes, + const uint32_t queries_offset, + const uint32_t n_probes, + const uint32_t k, + const uint32_t max_samples, + const uint32_t* chunk_indices, + const uint32_t dim, + IvfSampleFilterT sample_filter, + uint32_t* neighbors, + float* distances) +{ + extern __shared__ __align__(256) uint8_t interleaved_scan_kernel_smem[]; + constexpr bool kManageLocalTopK = Capacity > 0; + // Using shared memory for the (part of the) query; + // This allows to save on global memory bandwidth when reading index and query + // data at the same time. + // Its size is `query_smem_elems`. + T* query_shared = reinterpret_cast(interleaved_scan_kernel_smem); + // Make the query input and output point to this block's shared query + { + const int query_id = blockIdx.y; + query += query_id * dim; + if constexpr (kManageLocalTopK) { + neighbors += query_id * k * gridDim.x + blockIdx.x * k; + distances += query_id * k * gridDim.x + blockIdx.x * k; + } else { + distances += query_id * uint64_t(max_samples); + } + chunk_indices += (n_probes * query_id); + coarse_index += query_id * n_probes; + } + + // Copy a part of the query into shared memory for faster processing + copy_vectorized(query_shared, query, std::min(dim, query_smem_elems)); + __syncthreads(); + + using local_topk_t = block_sort_t; + local_topk_t queue(k); + { + using align_warp = raft::Pow2; + const int lane_id = align_warp::mod(threadIdx.x); + + // How many full warps needed to compute the distance (without remainder) + const uint32_t full_warps_along_dim = align_warp::roundDown(dim); + + const uint32_t shm_assisted_dim = + (dim > query_smem_elems) ? query_smem_elems : full_warps_along_dim; + + // Every CUDA block scans one cluster at a time. + for (int probe_id = blockIdx.x; probe_id < n_probes; probe_id += gridDim.x) { + const uint32_t list_id = coarse_index[probe_id]; // The id of cluster(list) + + // The number of vectors in each cluster(list); [nlist] + const uint32_t list_length = list_sizes[list_id]; + + // The number of interleaved groups to be processed + const uint32_t num_groups = + align_warp::div(list_length + align_warp::Mask); // ceildiv by power of 2 + + uint32_t sample_offset = 0; + if (probe_id > 0) { sample_offset = chunk_indices[probe_id - 1]; } + assert(list_length == chunk_indices[probe_id] - sample_offset); + assert(sample_offset + list_length <= max_samples); + + constexpr int kUnroll = raft::WarpSize / Veclen; + constexpr uint32_t kNumWarps = kThreadsPerBlock / raft::WarpSize; + // Every warp reads WarpSize vectors and computes the distances to them. + // Then, the distances and corresponding ids are distributed among the threads, + // and each thread adds one (id, dist) pair to the filtering queue. 
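+      // Worked example of the addressing below (kIndexGroupSize = 32, WarpSize = 32): interleaved
+      // group `g` of this list starts at element offset g * 32 * dim, lane `l` of the warp scores
+      // vector number g * 32 + l, and a list of length n spans num_groups = ceil(n / 32) groups.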
+ for (uint32_t group_id = align_warp::div(threadIdx.x); group_id < num_groups; + group_id += kNumWarps) { + AccT dist = 0; + // This is where this warp begins reading data (start position of an interleaved group) + const T* data = list_data_ptrs[list_id] + (group_id * kIndexGroupSize) * dim; + + // This is the vector a given lane/thread handles + const uint32_t vec_id = group_id * raft::WarpSize + lane_id; + const bool valid = + vec_id < list_length && sample_filter(queries_offset + blockIdx.y, list_id, vec_id); + + // Process first shm_assisted_dim dimensions (always using shared memory) + if (valid) { + loadAndComputeDist lc(dist, + compute_dist); + for (int pos = 0; pos < shm_assisted_dim; + pos += raft::WarpSize, data += kIndexGroupSize * raft::WarpSize) { + lc.runLoadShmemCompute(data, query_shared, lane_id, pos); + } + } + + if (dim > query_smem_elems) { + // The default path - using shfl ops - for dimensions beyond query_smem_elems + loadAndComputeDist lc(dist, + compute_dist); + for (int pos = shm_assisted_dim; pos < full_warps_along_dim; pos += raft::WarpSize) { + lc.runLoadShflAndCompute(data, query, pos, lane_id); + } + lc.runLoadShflAndComputeRemainder(data, query, lane_id, dim, full_warps_along_dim); + } else { + // when shm_assisted_dim == full_warps_along_dim < dim + if (valid) { + loadAndComputeDist<1, decltype(compute_dist), Veclen, T, AccT> lc(dist, compute_dist); + for (int pos = full_warps_along_dim; pos < dim; + pos += Veclen, data += kIndexGroupSize * Veclen) { + lc.runLoadShmemCompute(data, query_shared, lane_id, pos); + } + } + } + + // Enqueue one element per thread + const float val = valid ? static_cast(dist) : local_topk_t::queue_t::kDummy; + if constexpr (kManageLocalTopK) { + queue.add(val, sample_offset + vec_id); + } else { + if (vec_id < list_length) distances[sample_offset + vec_id] = val; + } + } + + // fill up unused slots for current query + if constexpr (!kManageLocalTopK) { + if (probe_id + 1 == n_probes) { + for (uint32_t i = threadIdx.x + sample_offset + list_length; i < max_samples; + i += blockDim.x) { + distances[i] = local_topk_t::queue_t::kDummy; + } + } + } + } + } + + // finalize and store selected neighbours + if constexpr (kManageLocalTopK) { + __syncthreads(); + queue.done(interleaved_scan_kernel_smem); + queue.store(distances, neighbors, post_process); + } +} + +/** + * Configure the gridDim.x to maximize GPU occupancy, but reduce the output size + */ +template +uint32_t configure_launch_x(uint32_t numQueries, uint32_t n_probes, int32_t sMemSize, T func) +{ + int dev_id; + RAFT_CUDA_TRY(cudaGetDevice(&dev_id)); + int num_sms; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id)); + int num_blocks_per_sm = 0; + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks_per_sm, func, kThreadsPerBlock, sMemSize)); + + size_t min_grid_size = num_sms * num_blocks_per_sm; + size_t min_grid_x = raft::ceildiv(min_grid_size, numQueries); + return min_grid_x > n_probes ? 
n_probes : static_cast(min_grid_x); +} + +template +void launch_kernel(Lambda lambda, + PostLambda post_process, + const index& index, + const T* queries, + const uint32_t* coarse_index, + const uint32_t num_queries, + const uint32_t queries_offset, + const uint32_t n_probes, + const uint32_t k, + const uint32_t max_samples, + const uint32_t* chunk_indices, + IvfSampleFilterT sample_filter, + uint32_t* neighbors, + float* distances, + uint32_t& grid_dim_x, + rmm::cuda_stream_view stream) +{ + RAFT_EXPECTS(Veclen == index.veclen(), + "Configured Veclen does not match the index interleaving pattern."); + constexpr auto kKernel = interleaved_scan_kernel; + const int max_query_smem = 16384; + int query_smem_elems = std::min(max_query_smem / sizeof(T), + raft::Pow2::roundUp(index.dim())); + int smem_size = query_smem_elems * sizeof(T); + + if constexpr (Capacity > 0) { + constexpr int kSubwarpSize = std::min(Capacity, raft::WarpSize); + auto block_merge_mem = + raft::matrix::detail::select::warpsort::calc_smem_size_for_block_wide( + kThreadsPerBlock / kSubwarpSize, k); + smem_size += std::max(smem_size, block_merge_mem); + } + + // power-of-two less than cuda limit (for better addr alignment) + constexpr uint32_t kMaxGridY = 32768; + + if (grid_dim_x == 0) { + grid_dim_x = configure_launch_x(std::min(kMaxGridY, num_queries), n_probes, smem_size, kKernel); + return; + } + + for (uint32_t query_offset = 0; query_offset < num_queries; query_offset += kMaxGridY) { + uint32_t grid_dim_y = std::min(kMaxGridY, num_queries - query_offset); + dim3 grid_dim(grid_dim_x, grid_dim_y, 1); + dim3 block_dim(kThreadsPerBlock); + RAFT_LOG_TRACE( + "Launching the ivf-flat interleaved_scan_kernel (%d, %d, 1) x (%d, 1, 1), n_probes = %d, " + "smem_size = %d", + grid_dim.x, + grid_dim.y, + block_dim.x, + n_probes, + smem_size); + kKernel<<>>(lambda, + post_process, + query_smem_elems, + queries, + coarse_index, + index.data_ptrs().data_handle(), + index.list_sizes().data_handle(), + queries_offset + query_offset, + n_probes, + k, + max_samples, + chunk_indices, + index.dim(), + sample_filter, + neighbors, + distances); + queries += grid_dim_y * index.dim(); + if constexpr (Capacity > 0) { + neighbors += grid_dim_y * grid_dim_x * k; + distances += grid_dim_y * grid_dim_x * k; + } else { + distances += grid_dim_y * max_samples; + } + chunk_indices += grid_dim_y * n_probes; + coarse_index += grid_dim_y * n_probes; + } +} + +template +struct euclidean_dist { + __device__ __forceinline__ void operator()(AccT& acc, AccT x, AccT y) + { + const auto diff = x - y; + acc += diff * diff; + } +}; + +template +struct euclidean_dist { + __device__ __forceinline__ void operator()(uint32_t& acc, uint32_t x, uint32_t y) + { + if constexpr (Veclen > 1) { + const auto diff = __vabsdiffu4(x, y); + acc = raft::dp4a(diff, diff, acc); + } else { + const auto diff = __usad(x, y, 0u); + acc += diff * diff; + } + } +}; + +template +struct euclidean_dist { + __device__ __forceinline__ void operator()(int32_t& acc, int32_t x, int32_t y) + { + if constexpr (Veclen > 1) { + // Note that we enforce here that the unsigned version of dp4a is used, because the difference + // between two int8 numbers can be greater than 127 and therefore represented as a negative + // number in int8. Casting from int8 to int32 would yield incorrect results, while casting + // from uint8 to uint32 is correct. 
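+      // For a concrete example of the comment above: bytes 127 and -128 differ by 255 (0xFF);
+      // treating 0xFF as signed would contribute (-1)^2 = 1 to the sum instead of the correct
+      // 255^2 = 65025, hence the unsigned dp4a on the absolute differences.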
+ const auto diff = __vabsdiffs4(x, y); + acc = raft::dp4a(diff, diff, static_cast(acc)); + } else { + const auto diff = x - y; + acc += diff * diff; + } + } +}; + +template +struct inner_prod_dist { + __device__ __forceinline__ void operator()(AccT& acc, AccT x, AccT y) + { + if constexpr (Veclen > 1 && (std::is_same_v || std::is_same_v)) { + acc = raft::dp4a(x, y, acc); + } else { + acc += x * y; + } + } +}; + +/** Select the distance computation function and forward the rest of the arguments. */ +template +void launch_with_fixed_consts(cuvs::distance::DistanceType metric, Args&&... args) +{ + switch (metric) { + case cuvs::distance::DistanceType::L2Expanded: + case cuvs::distance::DistanceType::L2Unexpanded: + return launch_kernel, + raft::identity_op>({}, {}, std::forward(args)...); + case cuvs::distance::DistanceType::L2SqrtExpanded: + case cuvs::distance::DistanceType::L2SqrtUnexpanded: + return launch_kernel, + raft::sqrt_op>({}, {}, std::forward(args)...); + case cuvs::distance::DistanceType::InnerProduct: + return launch_kernel, + raft::identity_op>({}, {}, std::forward(args)...); + // NB: update the description of `knn::ivf_flat::build` when adding here a new metric. + default: RAFT_FAIL("The chosen distance metric is not supported (%d)", int(metric)); + } +} + +/** + * Lift the `capacity` and `veclen` parameters to the template level, + * forward the rest of the arguments unmodified to `launch_interleaved_scan_kernel`. + */ +template (1, 16 / sizeof(T))> +struct select_interleaved_scan_kernel { + /** + * Recursively reduce the `Capacity` and `Veclen` parameters until they match the + * corresponding runtime arguments. + * By default, this recursive process starts with maximum possible values of the + * two parameters and ends with both values equal to 1. + */ + template + static inline void run(int k_max, int veclen, bool select_min, Args&&... args) + { + if constexpr (Capacity > 0) { + if (k_max == 0 || k_max > Capacity) { + return select_interleaved_scan_kernel::run( + k_max, veclen, select_min, std::forward(args)...); + } + } + if constexpr (Capacity > 1) { + if (k_max * 2 <= Capacity) { + return select_interleaved_scan_kernel::run(k_max, + veclen, + select_min, + std::forward(args)...); + } + } + if constexpr (Veclen > 1) { + if (veclen % Veclen != 0) { + return select_interleaved_scan_kernel::run( + k_max, 1, select_min, std::forward(args)...); + } + } + // NB: this is the limitation of the warpsort structures that use a huge number of + // registers (used in the main kernel here). + RAFT_EXPECTS(Capacity == 0 || k_max == Capacity, + "Capacity must be either 0 or a power-of-two not bigger than the maximum " + "allowed size matrix::detail::select::warpsort::kMaxCapacity (%d).", + raft::matrix::detail::select::warpsort::kMaxCapacity); + RAFT_EXPECTS( + veclen == Veclen, + "Veclen must be power-of-two not bigger than the maximum allowed size for this data type."); + if (select_min) { + launch_with_fixed_consts( + std::forward(args)...); + } else { + launch_with_fixed_consts( + std::forward(args)...); + } + } +}; + +/** + * @brief Configure and launch an appropriate template instance of the interleaved scan kernel. 
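+ *
+ * Note: when this is invoked with `grid_dim_x == 0`, the launch path (`launch_kernel`) only
+ * computes an occupancy-based grid width, writes it back through `grid_dim_x`, and returns
+ * without launching the kernel; the search implementation uses this first pass to size its
+ * per-probe output buffers before running the actual scan.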
+ * + * @tparam T value type + * @tparam AccT accumulated type + * @tparam IdxT type of the indices + * + * @param index previously built ivf-flat index + * @param[in] queries device pointer to the query vectors [batch_size, dim] + * @param[in] coarse_query_results device pointer to the cluster (list) ids [batch_size, n_probes] + * @param n_queries batch size + * @param[in] queries_offset + * An offset of the current query batch. It is used for feeding sample_filter with the + * correct query index. + * @param metric type of the measured distance + * @param n_probes number of nearest clusters to query + * @param k number of nearest neighbors. + * NB: the maximum value of `k` is limited statically by `kMaxCapacity`. + * @param select_min whether to select nearest (true) or furthest (false) points w.r.t. the given + * metric. + * @param[out] neighbors device pointer to the result indices for each query and cluster + * [batch_size, grid_dim_x, k] + * @param[out] distances device pointer to the result distances for each query and cluster + * [batch_size, grid_dim_x, k] + * @param[inout] grid_dim_x number of blocks launched across all n_probes clusters; + * (one block processes one or more probes, hence: 1 <= grid_dim_x <= n_probes) + * @param stream + * @param sample_filter + * A filter that selects samples for a given query. Use an instance of none_ivf_sample_filter to + * provide a green light for every sample. + */ +template +void ivfflat_interleaved_scan(const index& index, + const T* queries, + const uint32_t* coarse_query_results, + const uint32_t n_queries, + const uint32_t queries_offset, + const cuvs::distance::DistanceType metric, + const uint32_t n_probes, + const uint32_t k, + const uint32_t max_samples, + const uint32_t* chunk_indices, + const bool select_min, + IvfSampleFilterT sample_filter, + uint32_t* neighbors, + float* distances, + uint32_t& grid_dim_x, + rmm::cuda_stream_view stream) +{ + const int capacity = raft::bound_by_power_of_two(k); + + auto filter_adapter = cuvs::neighbors::filtering::ivf_to_sample_filter( + index.inds_ptrs().data_handle(), sample_filter); + select_interleaved_scan_kernel::run(capacity, + index.veclen(), + select_min, + metric, + index, + queries, + coarse_query_results, + n_queries, + queries_offset, + n_probes, + k, + max_samples, + chunk_indices, + filter_adapter, + neighbors, + distances, + grid_dim_x, + stream); +} + +} // namespace cuvs::neighbors::ivf_flat::detail diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh new file mode 100644 index 000000000..bb106ba2f --- /dev/null +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "../../core/nvtx.hpp" +#include "../detail/ann_utils.cuh" +#include "../ivf_common.cuh" // cuvs::neighbors::detail::ivf +#include "ivf_flat_interleaved_scan.cuh" // interleaved_scan +#include // raft::neighbors::ivf_flat::index +#include // none_ivf_sample_filter + +#include // is_min_close, DistanceType +#include // RAFT_LOG_TRACE +#include +#include // raft::resources +#include // raft::linalg::gemm +#include // raft::linalg::norm +#include // raft::linalg::unary_op +#include // matrix::detail::select_k +#include // utils::mapping + +#include + +namespace cuvs::neighbors::ivf_flat::detail { + +using namespace cuvs::spatial::knn::detail; // NOLINT + +template +void search_impl(raft::resources const& handle, + const cuvs::neighbors::ivf_flat::index& index, + const T* queries, + uint32_t n_queries, + uint32_t queries_offset, + uint32_t k, + uint32_t n_probes, + uint32_t max_samples, + bool select_min, + IdxT* neighbors, + AccT* distances, + rmm::device_async_resource_ref search_mr, + IvfSampleFilterT sample_filter) +{ + auto stream = raft::resource::get_cuda_stream(handle); + + std::size_t n_queries_probes = std::size_t(n_queries) * std::size_t(n_probes); + + // The norm of query + rmm::device_uvector query_norm_dev(n_queries, stream, search_mr); + // The distance value of cluster(list) and queries + rmm::device_uvector distance_buffer_dev(n_queries * index.n_lists(), stream, search_mr); + // The topk distance value of cluster(list) and queries + rmm::device_uvector coarse_distances_dev(n_queries_probes, stream, search_mr); + // The topk index of cluster(list) and queries + rmm::device_uvector coarse_indices_dev(n_queries_probes, stream, search_mr); + + // Optional structures if postprocessing is required + // The topk distance value of candidate vectors from each cluster(list) + rmm::device_uvector distances_tmp_dev(0, stream, search_mr); + // Number of samples for each query + rmm::device_uvector num_samples(0, stream, search_mr); + // Offsets per probe for each query + rmm::device_uvector chunk_index(0, stream, search_mr); + + // The topk index of candidate vectors from each cluster(list), local index offset + // also we might need additional storage for select_k + rmm::device_uvector indices_tmp_dev(0, stream, search_mr); + rmm::device_uvector neighbors_uint32_buf(0, stream, search_mr); + + size_t float_query_size; + if constexpr (std::is_integral_v) { + float_query_size = n_queries * index.dim(); + } else { + float_query_size = 0; + } + rmm::device_uvector converted_queries_dev(float_query_size, stream, search_mr); + float* converted_queries_ptr = converted_queries_dev.data(); + + if constexpr (std::is_same_v) { + converted_queries_ptr = const_cast(queries); + } else { + raft::linalg::unaryOp( + converted_queries_ptr, queries, n_queries * index.dim(), utils::mapping{}, stream); + } + + float alpha = 1.0f; + float beta = 0.0f; + + // todo(lsugy): raft distance? 
(if performance is similar/better than gemm) + switch (index.metric()) { + case cuvs::distance::DistanceType::L2Expanded: + case cuvs::distance::DistanceType::L2SqrtExpanded: { + alpha = -2.0f; + beta = 1.0f; + raft::linalg::rowNorm(query_norm_dev.data(), + converted_queries_ptr, + static_cast(index.dim()), + static_cast(n_queries), + raft::linalg::L2Norm, + true, + stream); + utils::outer_add(query_norm_dev.data(), + (IdxT)n_queries, + index.center_norms()->data_handle(), + (IdxT)index.n_lists(), + distance_buffer_dev.data(), + stream); + RAFT_LOG_TRACE_VEC(index.center_norms()->data_handle(), std::min(20, index.dim())); + RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), std::min(20, index.n_lists())); + break; + } + default: { + alpha = 1.0f; + beta = 0.0f; + } + } + + raft::linalg::gemm(handle, + true, + false, + index.n_lists(), + n_queries, + index.dim(), + &alpha, + index.centers().data_handle(), + index.dim(), + converted_queries_ptr, + index.dim(), + &beta, + distance_buffer_dev.data(), + index.n_lists(), + stream); + + RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), std::min(20, index.n_lists())); + raft::matrix::detail::select_k(handle, + distance_buffer_dev.data(), + nullptr, + n_queries, + index.n_lists(), + n_probes, + coarse_distances_dev.data(), + coarse_indices_dev.data(), + select_min); + RAFT_LOG_TRACE_VEC(coarse_indices_dev.data(), n_probes); + RAFT_LOG_TRACE_VEC(coarse_distances_dev.data(), n_probes); + + uint32_t grid_dim_x = 0; + if (n_probes > 1) { + // query the gridDimX size to store probes topK output + ivfflat_interleaved_scan::value_t, IdxT, IvfSampleFilterT>( + index, + nullptr, + nullptr, + n_queries, + queries_offset, + index.metric(), + n_probes, + k, + 0, + nullptr, + select_min, + sample_filter, + nullptr, + nullptr, + grid_dim_x, + stream); + } else { + grid_dim_x = 1; + } + + num_samples.resize(n_queries, stream); + chunk_index.resize(n_queries_probes, stream); + + ivf::detail::calc_chunk_indices::configure(n_probes, n_queries)(index.list_sizes().data_handle(), + coarse_indices_dev.data(), + chunk_index.data(), + num_samples.data(), + stream); + + auto distances_dev_ptr = distances; + + uint32_t* neighbors_uint32 = nullptr; + if constexpr (sizeof(IdxT) == sizeof(uint32_t)) { + neighbors_uint32 = reinterpret_cast(neighbors); + } else { + neighbors_uint32_buf.resize(std::size_t(n_queries) * std::size_t(k), stream); + neighbors_uint32 = neighbors_uint32_buf.data(); + } + + uint32_t* indices_dev_ptr = nullptr; + + bool manage_local_topk = is_local_topk_feasible(k); + if (!manage_local_topk || grid_dim_x > 1) { + auto target_size = std::size_t(n_queries) * (manage_local_topk ? 
grid_dim_x * k : max_samples); + + distances_tmp_dev.resize(target_size, stream); + if (manage_local_topk) indices_tmp_dev.resize(target_size, stream); + + distances_dev_ptr = distances_tmp_dev.data(); + indices_dev_ptr = indices_tmp_dev.data(); + } else { + indices_dev_ptr = neighbors_uint32; + } + + ivfflat_interleaved_scan::value_t, IdxT, IvfSampleFilterT>( + index, + queries, + coarse_indices_dev.data(), + n_queries, + queries_offset, + index.metric(), + n_probes, + k, + max_samples, + chunk_index.data(), + select_min, + sample_filter, + indices_dev_ptr, + distances_dev_ptr, + grid_dim_x, + stream); + + RAFT_LOG_TRACE_VEC(distances_dev_ptr, 2 * k); + if (indices_dev_ptr != nullptr) { RAFT_LOG_TRACE_VEC(indices_dev_ptr, 2 * k); } + + // Merge topk values from different blocks + if (!manage_local_topk || grid_dim_x > 1) { + raft::matrix::detail::select_k( + handle, + distances_tmp_dev.data(), + indices_tmp_dev.data(), + n_queries, + manage_local_topk ? (k * grid_dim_x) : max_samples, + k, + distances, + neighbors_uint32, + select_min, + false, + raft::matrix::SelectAlgo::kAuto, + manage_local_topk ? nullptr : num_samples.data()); + } + if (!manage_local_topk) { + // post process distances && neighbor IDs + ivf::detail::postprocess_distances( + distances, distances, index.metric(), n_queries, k, 1.0, false, stream); + } + ivf::detail::postprocess_neighbors(neighbors, + neighbors_uint32, + index.inds_ptrs().data_handle(), + coarse_indices_dev.data(), + chunk_index.data(), + n_queries, + n_probes, + k, + stream); +} + +/** See raft::neighbors::ivf_flat::search docs */ +template +inline void search_with_filtering( + raft::resources const& handle, + const search_params& params, + const index& index, + const T* queries, + uint32_t n_queries, + uint32_t k, + IdxT* neighbors, + float* distances, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource(), + IvfSampleFilterT sample_filter = IvfSampleFilterT()) +{ + common::nvtx::range fun_scope( + "ivf_flat::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim()); + + RAFT_EXPECTS(params.n_probes > 0, + "n_probes (number of clusters to probe in the search) must be positive."); + auto n_probes = std::min(params.n_probes, index.n_lists()); + bool manage_local_topk = is_local_topk_feasible(k); + + uint32_t max_samples = 0; + if (!manage_local_topk) { + IdxT ms = raft::Pow2<128 / sizeof(float)>::roundUp( + std::max(index.accum_sorted_sizes()(n_probes), k)); + RAFT_EXPECTS(ms <= IdxT(std::numeric_limits::max()), + "The maximum sample size is too big."); + max_samples = ms; + } + + // a batch size heuristic: try to keep the workspace within the specified size + constexpr uint64_t kExpectedWsSize = 1024 * 1024 * 1024; + uint64_t max_ws_size = + std::min(raft::resource::get_workspace_free_bytes(handle), kExpectedWsSize); + + uint64_t ws_size_per_query = 4ull * (2 * n_probes + index.n_lists() + index.dim() + 1) + + (manage_local_topk ? 
((sizeof(IdxT) + 4) * n_probes * k) + : (4ull * (max_samples + n_probes + 1))); + + const uint32_t max_queries = + std::min(n_queries, raft::div_rounding_up_safe(max_ws_size, ws_size_per_query)); + + for (uint32_t offset_q = 0; offset_q < n_queries; offset_q += max_queries) { + uint32_t queries_batch = raft::min(max_queries, n_queries - offset_q); + + search_impl(handle, + index, + queries + offset_q * index.dim(), + queries_batch, + offset_q, + k, + n_probes, + max_samples, + cuvs::distance::is_min_close(index.metric()), + neighbors + offset_q * k, + distances + offset_q * k, + mr, + sample_filter); + } +} + +template +void search_with_filtering(raft::resources const& handle, + const search_params& params, + const index& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances, + IvfSampleFilterT sample_filter = IvfSampleFilterT()) +{ + RAFT_EXPECTS( + queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0), + "Number of rows in output neighbors and distances matrices must equal the number of queries."); + + RAFT_EXPECTS(neighbors.extent(1) == distances.extent(1), + "Number of columns in output neighbors and distances matrices must be equal"); + + RAFT_EXPECTS(queries.extent(1) == index.dim(), + "Number of query dimensions should equal number of dimensions in the index."); + + search_with_filtering(handle, + params, + index, + queries.data_handle(), + static_cast(queries.extent(0)), + static_cast(neighbors.extent(1)), + neighbors.data_handle(), + distances.data_handle(), + raft::resource::get_workspace_resource(handle), + sample_filter); +} + +template +void search(raft::resources const& handle, + const search_params& params, + const index& idx, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) +{ + search_with_filtering(handle, + params, + idx, + queries, + neighbors, + distances, + cuvs::neighbors::filtering::none_ivf_sample_filter()); +} + +} // namespace cuvs::neighbors::ivf_flat::detail diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_search_float_int64_t.cpp b/cpp/src/neighbors/ivf_flat/ivf_flat_search_float_int64_t.cu similarity index 58% rename from cpp/src/neighbors/ivf_flat/ivf_flat_search_float_int64_t.cpp rename to cpp/src/neighbors/ivf_flat/ivf_flat_search_float_int64_t.cu index 48a584e9e..93e46cbef 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_search_float_int64_t.cpp +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_search_float_int64_t.cu @@ -24,7 +24,8 @@ */ #include -#include + +#include "ivf_flat_search.cuh" namespace cuvs::neighbors::ivf_flat { @@ -36,8 +37,20 @@ namespace cuvs::neighbors::ivf_flat { raft::device_matrix_view neighbors, \ raft::device_matrix_view distances) \ { \ - raft::runtime::neighbors::ivf_flat::search( \ - handle, params, *index.get_raft_index(), queries, neighbors, distances); \ + cuvs::neighbors::ivf_flat::detail::search( \ + handle, params, index, queries, neighbors, distances); \ + } \ + void search_with_filtering( \ + raft::resources const& handle, \ + const search_params& params, \ + index& idx, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances, \ + cuvs::neighbors::filtering::bitset_filter sample_filter) \ + { \ + cuvs::neighbors::ivf_flat::detail::search_with_filtering( \ + handle, params, idx, queries, neighbors, distances, sample_filter); \ } CUVS_INST_IVF_FLAT_SEARCH(float, int64_t); diff --git 
a/cpp/src/neighbors/ivf_flat/ivf_flat_search_int8_t_int64_t.cpp b/cpp/src/neighbors/ivf_flat/ivf_flat_search_int8_t_int64_t.cu similarity index 58% rename from cpp/src/neighbors/ivf_flat/ivf_flat_search_int8_t_int64_t.cpp rename to cpp/src/neighbors/ivf_flat/ivf_flat_search_int8_t_int64_t.cu index 5645c18e0..5f75d3d48 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_search_int8_t_int64_t.cpp +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_search_int8_t_int64_t.cu @@ -24,7 +24,8 @@ */ #include -#include + +#include "ivf_flat_search.cuh" namespace cuvs::neighbors::ivf_flat { @@ -36,8 +37,20 @@ namespace cuvs::neighbors::ivf_flat { raft::device_matrix_view neighbors, \ raft::device_matrix_view distances) \ { \ - raft::runtime::neighbors::ivf_flat::search( \ - handle, params, *index.get_raft_index(), queries, neighbors, distances); \ + cuvs::neighbors::ivf_flat::detail::search( \ + handle, params, index, queries, neighbors, distances); \ + } \ + void search_with_filtering( \ + raft::resources const& handle, \ + const search_params& params, \ + index& idx, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances, \ + cuvs::neighbors::filtering::bitset_filter sample_filter) \ + { \ + cuvs::neighbors::ivf_flat::detail::search_with_filtering( \ + handle, params, idx, queries, neighbors, distances, sample_filter); \ } CUVS_INST_IVF_FLAT_SEARCH(int8_t, int64_t); diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_search_uint8_t_int64_t.cpp b/cpp/src/neighbors/ivf_flat/ivf_flat_search_uint8_t_int64_t.cu similarity index 58% rename from cpp/src/neighbors/ivf_flat/ivf_flat_search_uint8_t_int64_t.cpp rename to cpp/src/neighbors/ivf_flat/ivf_flat_search_uint8_t_int64_t.cu index ab1bee8b6..a2696dc84 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_search_uint8_t_int64_t.cpp +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_search_uint8_t_int64_t.cu @@ -24,7 +24,8 @@ */ #include -#include + +#include "ivf_flat_search.cuh" namespace cuvs::neighbors::ivf_flat { @@ -36,8 +37,20 @@ namespace cuvs::neighbors::ivf_flat { raft::device_matrix_view neighbors, \ raft::device_matrix_view distances) \ { \ - raft::runtime::neighbors::ivf_flat::search( \ - handle, params, *index.get_raft_index(), queries, neighbors, distances); \ + cuvs::neighbors::ivf_flat::detail::search( \ + handle, params, index, queries, neighbors, distances); \ + } \ + void search_with_filtering( \ + raft::resources const& handle, \ + const search_params& params, \ + index& idx, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances, \ + cuvs::neighbors::filtering::bitset_filter sample_filter) \ + { \ + cuvs::neighbors::ivf_flat::detail::search_with_filtering( \ + handle, params, idx, queries, neighbors, distances, sample_filter); \ } CUVS_INST_IVF_FLAT_SEARCH(uint8_t, int64_t); diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_serialize.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_serialize.cuh new file mode 100644 index 000000000..3d3ff49d0 --- /dev/null +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_serialize.cuh @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../ivf_common.cuh" +#include "../ivf_list.cuh" +#include +#include + +#include +#include +#include +#include +#include + +#include + +namespace cuvs::neighbors::ivf_flat::detail { + +// Serialization version +// No backward compatibility yet; that is, can't add additional fields without breaking +// backward compatibility. +// TODO(hcho3) Implement next-gen serializer for IVF that allows for expansion in a backward +// compatible fashion. +constexpr int serialization_version = 4; + +/** + * Save the index to file. + * + * Experimental, both the API and the serialization format are subject to change. + * + * @param[in] handle the raft handle + * @param[in] filename the file name for saving the index + * @param[in] index_ IVF-Flat index + * + */ +template +void serialize(raft::resources const& handle, std::ostream& os, const index& index_) +{ + RAFT_LOG_DEBUG( + "Saving IVF-Flat index, size %zu, dim %u", static_cast(index_.size()), index_.dim()); + + std::string dtype_string = raft::detail::numpy_serializer::get_numpy_dtype().to_string(); + dtype_string.resize(4); + os << dtype_string; + + serialize_scalar(handle, os, serialization_version); + serialize_scalar(handle, os, index_.size()); + serialize_scalar(handle, os, index_.dim()); + serialize_scalar(handle, os, index_.n_lists()); + serialize_scalar(handle, os, index_.metric()); + serialize_scalar(handle, os, index_.adaptive_centers()); + serialize_scalar(handle, os, index_.conservative_memory_allocation()); + serialize_mdspan(handle, os, index_.centers()); + if (index_.center_norms()) { + bool has_norms = true; + serialize_scalar(handle, os, has_norms); + serialize_mdspan(handle, os, *index_.center_norms()); + } else { + bool has_norms = false; + serialize_scalar(handle, os, has_norms); + } + auto sizes_host = raft::make_host_vector(index_.list_sizes().extent(0)); + raft::copy(sizes_host.data_handle(), + index_.list_sizes().data_handle(), + sizes_host.size(), + raft::resource::get_cuda_stream(handle)); + raft::resource::sync_stream(handle); + serialize_mdspan(handle, os, sizes_host.view()); + + list_spec list_store_spec{index_.dim(), true}; + for (uint32_t label = 0; label < index_.n_lists(); label++) { + ivf::serialize_list(handle, + os, + index_.lists()[label], + list_store_spec, + raft::Pow2::roundUp(sizes_host(label))); + } + raft::resource::sync_stream(handle); +} + +template +void serialize(raft::resources const& handle, + const std::string& filename, + const index& index_) +{ + std::ofstream of(filename, std::ios::out | std::ios::binary); + if (!of) { RAFT_FAIL("Cannot open file %s", filename.c_str()); } + + detail::serialize(handle, of, index_); + + of.close(); + if (!of) { RAFT_FAIL("Error writing output %s", filename.c_str()); } +} + +/** Load an index from file. + * + * Experimental, both the API and the serialization format are subject to change. 
+ * + * @param[in] handle the raft handle + * @param[in] filename the name of the file that stores the index + * @param[in] index_ IVF-Flat index + * + */ +template +auto deserialize(raft::resources const& handle, std::istream& is) -> index +{ + char dtype_string[4]; + is.read(dtype_string, 4); + + auto ver = raft::deserialize_scalar(handle, is); + if (ver != serialization_version) { + RAFT_FAIL("serialization version mismatch, expected %d, got %d ", serialization_version, ver); + } + auto n_rows = raft::deserialize_scalar(handle, is); + auto dim = raft::deserialize_scalar(handle, is); + auto n_lists = raft::deserialize_scalar(handle, is); + auto metric = raft::deserialize_scalar(handle, is); + bool adaptive_centers = raft::deserialize_scalar(handle, is); + bool cma = raft::deserialize_scalar(handle, is); + + index index_ = index(handle, metric, n_lists, adaptive_centers, cma, dim); + + deserialize_mdspan(handle, is, index_.centers()); + bool has_norms = raft::deserialize_scalar(handle, is); + if (has_norms) { + index_.allocate_center_norms(handle); + if (!index_.center_norms()) { + RAFT_FAIL("Error inconsistent center norms"); + } else { + auto center_norms = index_.center_norms().value(); + deserialize_mdspan(handle, is, center_norms); + } + } + deserialize_mdspan(handle, is, index_.list_sizes()); + + list_spec list_device_spec{index_.dim(), cma}; + list_spec list_store_spec{index_.dim(), true}; + for (uint32_t label = 0; label < index_.n_lists(); label++) { + ivf::deserialize_list(handle, is, index_.lists()[label], list_store_spec, list_device_spec); + } + raft::resource::sync_stream(handle); + + ivf::detail::recompute_internal_state(handle, index_); + + return index_; +} + +template +auto deserialize(raft::resources const& handle, const std::string& filename) -> index +{ + std::ifstream is(filename, std::ios::in | std::ios::binary); + + if (!is) { RAFT_FAIL("Cannot open file %s", filename.c_str()); } + + auto index = detail::deserialize(handle, is); + + is.close(); + + return index; +} +} // namespace cuvs::neighbors::ivf_flat::detail diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_serialize_float_int64_t.cpp b/cpp/src/neighbors/ivf_flat/ivf_flat_serialize_float_int64_t.cu similarity index 61% rename from cpp/src/neighbors/ivf_flat/ivf_flat_serialize_float_int64_t.cpp rename to cpp/src/neighbors/ivf_flat/ivf_flat_serialize_float_int64_t.cu index 19a3d72d9..9ab00623a 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_serialize_float_int64_t.cpp +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_serialize_float_int64_t.cu @@ -24,38 +24,40 @@ */ #include -#include + +#include "ivf_flat_serialize.cuh" namespace cuvs::neighbors::ivf_flat { -#define CUVS_INST_IVF_FLAT_SERIALIZE(T, IdxT) \ - void serialize_file(raft::resources const& handle, \ - const std::string& filename, \ - const cuvs::neighbors::ivf_flat::index& index) \ - { \ - raft::runtime::neighbors::ivf_flat::serialize_file(handle, filename, *index.get_raft_index()); \ - } \ - \ - void deserialize_file(raft::resources const& handle, \ - const std::string& filename, \ - cuvs::neighbors::ivf_flat::index* index) \ - { \ - raft::runtime::neighbors::ivf_flat::deserialize_file( \ - handle, filename, index->get_raft_index()); \ - } \ - \ - void serialize(raft::resources const& handle, \ - std::string& str, \ - const cuvs::neighbors::ivf_flat::index& index) \ - { \ - raft::runtime::neighbors::ivf_flat::serialize(handle, str, *index.get_raft_index()); \ - } \ - \ - void deserialize(raft::resources const& handle, \ - const std::string& str, \ - 
cuvs::neighbors::ivf_flat::index* index) \ - { \ - raft::runtime::neighbors::ivf_flat::deserialize(handle, str, index->get_raft_index()); \ +#define CUVS_INST_IVF_FLAT_SERIALIZE(T, IdxT) \ + void serialize_file(raft::resources const& handle, \ + const std::string& filename, \ + const cuvs::neighbors::ivf_flat::index& index) \ + { \ + cuvs::neighbors::ivf_flat::detail::serialize(handle, filename, index); \ + } \ + \ + void serialize(raft::resources const& handle, \ + std::string& str, \ + const cuvs::neighbors::ivf_flat::index& index) \ + { \ + std::ostringstream os; \ + cuvs::neighbors::ivf_flat::detail::serialize(handle, os, index); \ + str = os.str(); \ + } \ + \ + void deserialize_file(raft::resources const& handle, \ + const std::string& filename, \ + cuvs::neighbors::ivf_flat::index* index) \ + { \ + *index = cuvs::neighbors::ivf_flat::detail::deserialize(handle, filename); \ + } \ + void deserialize(raft::resources const& handle, \ + const std::string& str, \ + cuvs::neighbors::ivf_flat::index* index) \ + { \ + std::istringstream is(str); \ + *index = cuvs::neighbors::ivf_flat::detail::deserialize(handle, is); \ } CUVS_INST_IVF_FLAT_SERIALIZE(float, int64_t); diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_serialize_int8_t_int64_t.cpp b/cpp/src/neighbors/ivf_flat/ivf_flat_serialize_int8_t_int64_t.cu similarity index 61% rename from cpp/src/neighbors/ivf_flat/ivf_flat_serialize_int8_t_int64_t.cpp rename to cpp/src/neighbors/ivf_flat/ivf_flat_serialize_int8_t_int64_t.cu index f65fe221d..18d6a0287 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_serialize_int8_t_int64_t.cpp +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_serialize_int8_t_int64_t.cu @@ -24,38 +24,40 @@ */ #include -#include + +#include "ivf_flat_serialize.cuh" namespace cuvs::neighbors::ivf_flat { -#define CUVS_INST_IVF_FLAT_SERIALIZE(T, IdxT) \ - void serialize_file(raft::resources const& handle, \ - const std::string& filename, \ - const cuvs::neighbors::ivf_flat::index& index) \ - { \ - raft::runtime::neighbors::ivf_flat::serialize_file(handle, filename, *index.get_raft_index()); \ - } \ - \ - void deserialize_file(raft::resources const& handle, \ - const std::string& filename, \ - cuvs::neighbors::ivf_flat::index* index) \ - { \ - raft::runtime::neighbors::ivf_flat::deserialize_file( \ - handle, filename, index->get_raft_index()); \ - } \ - \ - void serialize(raft::resources const& handle, \ - std::string& str, \ - const cuvs::neighbors::ivf_flat::index& index) \ - { \ - raft::runtime::neighbors::ivf_flat::serialize(handle, str, *index.get_raft_index()); \ - } \ - \ - void deserialize(raft::resources const& handle, \ - const std::string& str, \ - cuvs::neighbors::ivf_flat::index* index) \ - { \ - raft::runtime::neighbors::ivf_flat::deserialize(handle, str, index->get_raft_index()); \ +#define CUVS_INST_IVF_FLAT_SERIALIZE(T, IdxT) \ + void serialize_file(raft::resources const& handle, \ + const std::string& filename, \ + const cuvs::neighbors::ivf_flat::index& index) \ + { \ + cuvs::neighbors::ivf_flat::detail::serialize(handle, filename, index); \ + } \ + \ + void serialize(raft::resources const& handle, \ + std::string& str, \ + const cuvs::neighbors::ivf_flat::index& index) \ + { \ + std::ostringstream os; \ + cuvs::neighbors::ivf_flat::detail::serialize(handle, os, index); \ + str = os.str(); \ + } \ + \ + void deserialize_file(raft::resources const& handle, \ + const std::string& filename, \ + cuvs::neighbors::ivf_flat::index* index) \ + { \ + *index = cuvs::neighbors::ivf_flat::detail::deserialize(handle, filename); 
\ + } \ + void deserialize(raft::resources const& handle, \ + const std::string& str, \ + cuvs::neighbors::ivf_flat::index* index) \ + { \ + std::istringstream is(str); \ + *index = cuvs::neighbors::ivf_flat::detail::deserialize(handle, is); \ } CUVS_INST_IVF_FLAT_SERIALIZE(int8_t, int64_t); diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_serialize_uint8_t_int64_t.cpp b/cpp/src/neighbors/ivf_flat/ivf_flat_serialize_uint8_t_int64_t.cu similarity index 61% rename from cpp/src/neighbors/ivf_flat/ivf_flat_serialize_uint8_t_int64_t.cpp rename to cpp/src/neighbors/ivf_flat/ivf_flat_serialize_uint8_t_int64_t.cu index 5f312dc0a..c5ab7d5c1 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_serialize_uint8_t_int64_t.cpp +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_serialize_uint8_t_int64_t.cu @@ -24,38 +24,40 @@ */ #include -#include + +#include "ivf_flat_serialize.cuh" namespace cuvs::neighbors::ivf_flat { -#define CUVS_INST_IVF_FLAT_SERIALIZE(T, IdxT) \ - void serialize_file(raft::resources const& handle, \ - const std::string& filename, \ - const cuvs::neighbors::ivf_flat::index& index) \ - { \ - raft::runtime::neighbors::ivf_flat::serialize_file(handle, filename, *index.get_raft_index()); \ - } \ - \ - void deserialize_file(raft::resources const& handle, \ - const std::string& filename, \ - cuvs::neighbors::ivf_flat::index* index) \ - { \ - raft::runtime::neighbors::ivf_flat::deserialize_file( \ - handle, filename, index->get_raft_index()); \ - } \ - \ - void serialize(raft::resources const& handle, \ - std::string& str, \ - const cuvs::neighbors::ivf_flat::index& index) \ - { \ - raft::runtime::neighbors::ivf_flat::serialize(handle, str, *index.get_raft_index()); \ - } \ - \ - void deserialize(raft::resources const& handle, \ - const std::string& str, \ - cuvs::neighbors::ivf_flat::index* index) \ - { \ - raft::runtime::neighbors::ivf_flat::deserialize(handle, str, index->get_raft_index()); \ +#define CUVS_INST_IVF_FLAT_SERIALIZE(T, IdxT) \ + void serialize_file(raft::resources const& handle, \ + const std::string& filename, \ + const cuvs::neighbors::ivf_flat::index& index) \ + { \ + cuvs::neighbors::ivf_flat::detail::serialize(handle, filename, index); \ + } \ + \ + void serialize(raft::resources const& handle, \ + std::string& str, \ + const cuvs::neighbors::ivf_flat::index& index) \ + { \ + std::ostringstream os; \ + cuvs::neighbors::ivf_flat::detail::serialize(handle, os, index); \ + str = os.str(); \ + } \ + \ + void deserialize_file(raft::resources const& handle, \ + const std::string& filename, \ + cuvs::neighbors::ivf_flat::index* index) \ + { \ + *index = cuvs::neighbors::ivf_flat::detail::deserialize(handle, filename); \ + } \ + void deserialize(raft::resources const& handle, \ + const std::string& str, \ + cuvs::neighbors::ivf_flat::index* index) \ + { \ + std::istringstream is(str); \ + *index = cuvs::neighbors::ivf_flat::detail::deserialize(handle, is); \ } CUVS_INST_IVF_FLAT_SERIALIZE(uint8_t, int64_t); diff --git a/cpp/src/neighbors/ivf_flat_index.cpp b/cpp/src/neighbors/ivf_flat_index.cpp index 678bec32a..b2fbbfc12 100644 --- a/cpp/src/neighbors/ivf_flat_index.cpp +++ b/cpp/src/neighbors/ivf_flat_index.cpp @@ -20,140 +20,203 @@ namespace cuvs::neighbors::ivf_flat { template index::index(raft::resources const& res, const index_params& params, uint32_t dim) - : ann::index(), - raft_index_(std::make_unique>( - res, - static_cast((int)params.metric), - params.n_lists, - params.adaptive_centers, - params.conservative_memory_allocation, - dim)) + : index(res, + params.metric, + 
params.n_lists, + params.adaptive_centers, + params.conservative_memory_allocation, + dim) { } template -index::index(raft::neighbors::ivf_flat::index&& raft_idx) +index::index(raft::resources const& res, + cuvs::distance::DistanceType metric, + uint32_t n_lists, + bool adaptive_centers, + bool conservative_memory_allocation, + uint32_t dim) : ann::index(), - raft_index_(std::make_unique>(std::move(raft_idx))) + veclen_(calculate_veclen(dim)), + metric_(metric), + adaptive_centers_(adaptive_centers), + conservative_memory_allocation_{conservative_memory_allocation}, + centers_(raft::make_device_matrix(res, n_lists, dim)), + center_norms_(std::nullopt), + lists_{n_lists}, + list_sizes_{raft::make_device_vector(res, n_lists)}, + data_ptrs_{raft::make_device_vector(res, n_lists)}, + inds_ptrs_{raft::make_device_vector(res, n_lists)}, + accum_sorted_sizes_{raft::make_host_vector(n_lists + 1)} { + check_consistency(); + accum_sorted_sizes_(n_lists) = 0; } template uint32_t index::veclen() const noexcept { - return raft_index_->veclen(); + return veclen_; } template cuvs::distance::DistanceType index::metric() const noexcept { - return static_cast((int)raft_index_->metric()); + return metric_; } template bool index::adaptive_centers() const noexcept { - return raft_index_->adaptive_centers(); + return adaptive_centers_; } template raft::device_vector_view index::list_sizes() noexcept { - return raft_index_->list_sizes(); + return list_sizes_.view(); } template raft::device_vector_view index::list_sizes() const noexcept { - return raft_index_->list_sizes(); + return list_sizes_.view(); } template raft::device_matrix_view index::centers() noexcept { - return raft_index_->centers(); + return centers_.view(); } template raft::device_matrix_view index::centers() const noexcept { - return raft_index_->centers(); + return centers_.view(); } template std::optional> index::center_norms() noexcept { - return raft_index_->center_norms(); + if (center_norms_.has_value()) { + return std::make_optional>(center_norms_->view()); + } else { + return std::nullopt; + } } template std::optional> index::center_norms() const noexcept { - return raft_index_->center_norms(); + if (center_norms_.has_value()) { + return std::make_optional>( + center_norms_->view()); + } else { + return std::nullopt; + } +} + +template +auto index::accum_sorted_sizes() noexcept -> raft::host_vector_view +{ + return accum_sorted_sizes_.view(); +} + +template +[[nodiscard]] auto index::accum_sorted_sizes() const noexcept + -> raft::host_vector_view +{ + return accum_sorted_sizes_.view(); } template IdxT index::size() const noexcept { - return raft_index_->size(); + return accum_sorted_sizes()(n_lists()); } template uint32_t index::dim() const noexcept { - return raft_index_->dim(); + return centers_.extent(1); } template uint32_t index::n_lists() const noexcept { - return raft_index_->n_lists(); + return lists_.size(); } template raft::device_vector_view index::data_ptrs() noexcept { - return raft_index_->data_ptrs(); + return data_ptrs_.view(); } template raft::device_vector_view index::data_ptrs() const noexcept { - return raft_index_->data_ptrs(); + return data_ptrs_.view(); } template raft::device_vector_view index::inds_ptrs() noexcept { - return raft_index_->inds_ptrs(); + return inds_ptrs_.view(); } template raft::device_vector_view index::inds_ptrs() const noexcept { - return raft_index_->inds_ptrs(); + return inds_ptrs_.view(); } template bool index::conservative_memory_allocation() const noexcept { - return 
raft_index_->conservative_memory_allocation(); + return conservative_memory_allocation_; +} + +template +void index::allocate_center_norms(raft::resources const& res) +{ + switch (metric_) { + case cuvs::distance::DistanceType::L2Expanded: + case cuvs::distance::DistanceType::L2SqrtExpanded: + case cuvs::distance::DistanceType::L2Unexpanded: + case cuvs::distance::DistanceType::L2SqrtUnexpanded: + center_norms_ = raft::make_device_vector(res, n_lists()); + break; + default: center_norms_ = std::nullopt; + } +} + +template +std::vector>>& index::lists() noexcept +{ + return lists_; } template -std::vector>>& -index::lists() noexcept +const std::vector>>& index::lists() const noexcept { - return raft_index_->lists(); + return lists_; } template -const std::vector>>& -index::lists() const noexcept +void index::check_consistency() { - return raft_index_->lists(); + auto n_lists = lists_.size(); + RAFT_EXPECTS(dim() % veclen_ == 0, "dimensionality is not a multiple of the veclen"); + RAFT_EXPECTS(list_sizes_.extent(0) == n_lists, "inconsistent list size"); + RAFT_EXPECTS(data_ptrs_.extent(0) == n_lists, "inconsistent list size"); + RAFT_EXPECTS(inds_ptrs_.extent(0) == n_lists, "inconsistent list size"); + RAFT_EXPECTS( // + (centers_.extent(0) == list_sizes_.extent(0)) && // + (!center_norms_.has_value() || centers_.extent(0) == center_norms_->extent(0)), + "inconsistent number of lists (clusters)"); } template struct index; diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_build_common.cu b/cpp/src/neighbors/ivf_pq/ivf_pq_build_common.cu index 15c965ec9..b08ebb17e 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_build_common.cu +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_build_common.cu @@ -164,4 +164,4 @@ void erase_list(raft::resources const& res, index* index, uint32_t labe detail::erase_list(res, index, label); } -} // namespace cuvs::neighbors::ivf_pq::helpers \ No newline at end of file +} // namespace cuvs::neighbors::ivf_pq::helpers diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh index 9b70ee250..dadd450b7 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cuh +++ b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -19,11 +19,14 @@ #include "ann_utils.cuh" #include "naive_knn.cuh" +#include #include #include #include -#include +#include +#include +#include namespace cuvs::neighbors::ivf_flat { @@ -41,7 +44,7 @@ struct AnnIvfFlatInputs { IdxT nlist; cuvs::distance::DistanceType metric; bool adaptive_centers; - // bool host_dataset; + bool host_dataset; }; template @@ -98,47 +101,6 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { rmm::device_uvector distances_ivfflat_dev(queries_size, stream_); rmm::device_uvector indices_ivfflat_dev(queries_size, stream_); - { - // legacy interface - raft::spatial::knn::IVFFlatParam ivfParams; - ivfParams.nprobe = ps.nprobe; - ivfParams.nlist = ps.nlist; - raft::spatial::knn::knnIndex index; - - raft::spatial::knn::approx_knn_build_index( - handle_, - &index, - dynamic_cast(&ivfParams), - static_cast((int)ps.metric), - (IdxT)0, - database.data(), - ps.num_db_vecs, - ps.dim); - - raft::resource::sync_stream(handle_); - raft::spatial::knn::approx_knn_search(handle_, - distances_ivfflat_dev.data(), - indices_ivfflat_dev.data(), - &index, - ps.k, - search_queries.data(), - ps.num_queries); - - raft::update_host( - distances_ivfflat.data(), distances_ivfflat_dev.data(), queries_size, stream_); - raft::update_host( - indices_ivfflat.data(), indices_ivfflat_dev.data(), queries_size, stream_); - raft::resource::sync_stream(handle_); - } - - 
ASSERT_TRUE(eval_neighbours(indices_naive, - indices_ivfflat, - distances_naive, - distances_ivfflat, - ps.num_queries, - ps.k, - 0.001, - min_recall)); { cuvs::neighbors::ivf_flat::index_params index_params; cuvs::neighbors::ivf_flat::search_params search_params; @@ -154,45 +116,42 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { cuvs::neighbors::ivf_flat::index idx(handle_, index_params, ps.dim); cuvs::neighbors::ivf_flat::index index_2(handle_, index_params, ps.dim); - // if (!ps.host_dataset) { - - auto database_view = raft::make_device_matrix_view( - (const DataT*)database.data(), ps.num_db_vecs, ps.dim); - idx = cuvs::neighbors::ivf_flat::build(handle_, index_params, database_view); - rmm::device_uvector vector_indices(ps.num_db_vecs, stream_); - thrust::sequence(raft::resource::get_thrust_policy(handle_), - thrust::device_pointer_cast(vector_indices.data()), - thrust::device_pointer_cast(vector_indices.data() + ps.num_db_vecs)); - raft::resource::sync_stream(handle_); - - IdxT half_of_data = ps.num_db_vecs / 2; + if (!ps.host_dataset) { + auto database_view = raft::make_device_matrix_view( + (const DataT*)database.data(), ps.num_db_vecs, ps.dim); + idx = cuvs::neighbors::ivf_flat::build(handle_, index_params, database_view); + rmm::device_uvector vector_indices(ps.num_db_vecs, stream_); + thrust::sequence(raft::resource::get_thrust_policy(handle_), + thrust::device_pointer_cast(vector_indices.data()), + thrust::device_pointer_cast(vector_indices.data() + ps.num_db_vecs)); + raft::resource::sync_stream(handle_); - auto half_of_data_view = raft::make_device_matrix_view( - (const DataT*)database.data(), half_of_data, ps.dim); + IdxT half_of_data = ps.num_db_vecs / 2; - const std::optional> no_opt = std::nullopt; - index_2 = cuvs::neighbors::ivf_flat::extend(handle_, half_of_data_view, no_opt, idx); + auto half_of_data_view = raft::make_device_matrix_view( + (const DataT*)database.data(), half_of_data, ps.dim); - auto new_half_of_data_view = raft::make_device_matrix_view( - database.data() + half_of_data * ps.dim, IdxT(ps.num_db_vecs) - half_of_data, ps.dim); + const std::optional> no_opt = std::nullopt; + index_2 = cuvs::neighbors::ivf_flat::extend(handle_, half_of_data_view, no_opt, idx); - auto new_half_of_data_indices_view = raft::make_device_vector_view( - vector_indices.data() + half_of_data, IdxT(ps.num_db_vecs) - half_of_data); + auto new_half_of_data_view = raft::make_device_matrix_view( + database.data() + half_of_data * ps.dim, IdxT(ps.num_db_vecs) - half_of_data, ps.dim); - cuvs::neighbors::ivf_flat::extend( - handle_, - new_half_of_data_view, - std::make_optional>( - new_half_of_data_indices_view), - &index_2); + auto new_half_of_data_indices_view = raft::make_device_vector_view( + vector_indices.data() + half_of_data, IdxT(ps.num_db_vecs) - half_of_data); - /* + cuvs::neighbors::ivf_flat::extend( + handle_, + new_half_of_data_view, + std::make_optional>( + new_half_of_data_indices_view), + &index_2); } else { auto host_database = raft::make_host_matrix(ps.num_db_vecs, ps.dim); raft::copy( host_database.data_handle(), database.data(), ps.num_db_vecs * ps.dim, stream_); - idx = ivf_flat::build(handle_, index_params, - raft::make_const_mdspan(host_database.view())); + idx = + ivf_flat::build(handle_, index_params, raft::make_const_mdspan(host_database.view())); auto vector_indices = raft::make_host_vector(handle_, ps.num_db_vecs); std::iota(vector_indices.data_handle(), vector_indices.data_handle() + ps.num_db_vecs, 0); @@ -217,7 +176,6 @@ class AnnIVFFlatTest : 
public ::testing::TestWithParam> { new_half_of_data_indices_view), &index_2); } - */ auto search_queries_view = raft::make_device_matrix_view( search_queries.data(), ps.num_queries, ps.dim); @@ -294,137 +252,137 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { } } - /* - void testPacker() - { - ivf_flat::index_params index_params; - ivf_flat::search_params search_params; - index_params.n_lists = ps.nlist; - index_params.metric = ps.metric; - index_params.adaptive_centers = false; - search_params.n_probes = ps.nprobe; - - index_params.add_data_on_build = false; - index_params.kmeans_trainset_fraction = 1.0; - index_params.metric_arg = 0; - - auto database_view = raft::make_device_matrix_view( - (const DataT*)database.data(), ps.num_db_vecs, ps.dim); - - auto idx = ivf_flat::build(handle_, index_params, database_view); - - const std::optional> no_opt = std::nullopt; - index extend_index = ivf_flat::extend(handle_, database_view, no_opt, idx); - - auto list_sizes = raft::make_host_vector(idx.n_lists()); - raft::update_host(list_sizes.data_handle(), - extend_index.list_sizes().data_handle(), - extend_index.n_lists(), - stream_); - raft::resource::sync_stream(handle_); + void testPacker() + { + ivf_flat::index_params index_params; + ivf_flat::search_params search_params; + index_params.n_lists = ps.nlist; + index_params.metric = ps.metric; + index_params.adaptive_centers = false; + search_params.n_probes = ps.nprobe; + + index_params.add_data_on_build = false; + index_params.kmeans_trainset_fraction = 1.0; + index_params.metric_arg = 0; + + auto database_view = raft::make_device_matrix_view( + (const DataT*)database.data(), ps.num_db_vecs, ps.dim); + + auto idx = ivf_flat::build(handle_, index_params, database_view); + + const std::optional> no_opt = std::nullopt; + index extend_index = ivf_flat::extend(handle_, database_view, no_opt, idx); + + auto list_sizes = raft::make_host_vector(idx.n_lists()); + raft::update_host(list_sizes.data_handle(), + extend_index.list_sizes().data_handle(), + extend_index.n_lists(), + stream_); + raft::resource::sync_stream(handle_); - auto& lists = idx.lists(); + auto& lists = idx.lists(); - // conservative memory allocation for codepacking - auto list_device_spec = list_spec{idx.dim(), false}; + // conservative memory allocation for codepacking + auto list_device_spec = list_spec{idx.dim(), false}; - for (uint32_t label = 0; label < idx.n_lists(); label++) { - uint32_t list_size = list_sizes.data_handle()[label]; + for (uint32_t label = 0; label < idx.n_lists(); label++) { + uint32_t list_size = list_sizes.data_handle()[label]; - ivf::resize_list(handle_, lists[label], list_device_spec, list_size, 0); - } + ivf::resize_list(handle_, lists[label], list_device_spec, list_size, 0); + } - idx.recompute_internal_state(handle_); + idx.recompute_internal_state(handle_); - using interleaved_group = Pow2; + using interleaved_group = raft::Pow2; - for (uint32_t label = 0; label < idx.n_lists(); label++) { - uint32_t list_size = list_sizes.data_handle()[label]; + for (uint32_t label = 0; label < idx.n_lists(); label++) { + uint32_t list_size = list_sizes.data_handle()[label]; - if (list_size > 0) { - uint32_t padded_list_size = interleaved_group::roundUp(list_size); - uint32_t n_elems = padded_list_size * idx.dim(); - auto list_data = lists[label]->data; - auto list_inds = extend_index.lists()[label]->indices; + if (list_size > 0) { + uint32_t padded_list_size = interleaved_group::roundUp(list_size); + uint32_t n_elems = padded_list_size * idx.dim(); + auto 
list_data = lists[label]->data; + auto list_inds = extend_index.lists()[label]->indices; - // fetch the flat codes - auto flat_codes = make_device_matrix(handle_, list_size, idx.dim()); + // fetch the flat codes + auto flat_codes = raft::make_device_matrix(handle_, list_size, idx.dim()); - matrix::gather( - handle_, - make_device_matrix_view( - (const DataT*)database.data(), static_cast(ps.num_db_vecs), idx.dim()), - make_device_vector_view((const IdxT*)list_inds.data_handle(), - list_size), - flat_codes.view()); - - helpers::codepacker::pack( - handle_, make_const_mdspan(flat_codes.view()), idx.veclen(), 0, list_data.view()); - - { - auto mask = make_device_vector(handle_, n_elems); - - linalg::map_offset(handle_, - mask.view(), - [dim = idx.dim(), - list_size, - padded_list_size, - chunk_size = util::FastIntDiv(idx.veclen())] __device__(auto i) { - uint32_t max_group_offset = - interleaved_group::roundDown(list_size); if (i < max_group_offset * dim) { return true; } - uint32_t surplus = (i - max_group_offset * dim); - uint32_t ingroup_id = interleaved_group::mod(surplus / chunk_size); - return ingroup_id < (list_size - max_group_offset); - }); - - // ensure that the correct number of indices are masked out - ASSERT_TRUE(thrust::reduce(raft::resource::get_thrust_policy(handle_), - mask.data_handle(), - mask.data_handle() + n_elems, - 0) == list_size * ps.dim); - - auto packed_list_data = make_device_vector(handle_, n_elems); - - linalg::map_offset(handle_, - packed_list_data.view(), - [mask = mask.data_handle(), - list_data = list_data.data_handle()] __device__(uint32_t i) { - if (mask[i]) return list_data[i]; - return DataT{0}; - }); - - auto extend_data = extend_index.lists()[label]->data; - auto extend_data_filtered = make_device_vector(handle_, n_elems); - linalg::map_offset(handle_, - extend_data_filtered.view(), - [mask = mask.data_handle(), - extend_data = extend_data.data_handle()] __device__(uint32_t i) { - if (mask[i]) return extend_data[i]; - return DataT{0}; - }); - - ASSERT_TRUE(cuvs::devArrMatch(packed_list_data.data_handle(), - extend_data_filtered.data_handle(), - n_elems, - cuvs::Compare(), - stream_)); - } + raft::matrix::gather( + handle_, + raft::make_device_matrix_view( + (const DataT*)database.data(), static_cast(ps.num_db_vecs), idx.dim()), + raft::make_device_vector_view((const IdxT*)list_inds.data_handle(), + list_size), + flat_codes.view()); - auto unpacked_flat_codes = - make_device_matrix(handle_, list_size, idx.dim()); + helpers::codepacker::pack( + handle_, make_const_mdspan(flat_codes.view()), idx.veclen(), 0, list_data.view()); - helpers::codepacker::unpack( - handle_, list_data.view(), idx.veclen(), 0, unpacked_flat_codes.view()); + { + auto mask = raft::make_device_vector(handle_, n_elems); - ASSERT_TRUE(cuvs::devArrMatch(flat_codes.data_handle(), - unpacked_flat_codes.data_handle(), - list_size * ps.dim, + raft::linalg::map_offset( + handle_, + mask.view(), + [dim = idx.dim(), + list_size, + padded_list_size, + chunk_size = raft::util::FastIntDiv(idx.veclen())] __device__(auto i) { + uint32_t max_group_offset = interleaved_group::roundDown(list_size); + if (i < max_group_offset * dim) { return true; } + uint32_t surplus = (i - max_group_offset * dim); + uint32_t ingroup_id = interleaved_group::mod(surplus / chunk_size); + return ingroup_id < (list_size - max_group_offset); + }); + + // ensure that the correct number of indices are masked out + ASSERT_TRUE(thrust::reduce(raft::resource::get_thrust_policy(handle_), + mask.data_handle(), + 
mask.data_handle() + n_elems, + 0) == list_size * ps.dim); + + auto packed_list_data = raft::make_device_vector(handle_, n_elems); + + raft::linalg::map_offset(handle_, + packed_list_data.view(), + [mask = mask.data_handle(), + list_data = list_data.data_handle()] __device__(uint32_t i) { + if (mask[i]) return list_data[i]; + return DataT{0}; + }); + + auto extend_data = extend_index.lists()[label]->data; + auto extend_data_filtered = raft::make_device_vector(handle_, n_elems); + raft::linalg::map_offset( + handle_, + extend_data_filtered.view(), + [mask = mask.data_handle(), + extend_data = extend_data.data_handle()] __device__(uint32_t i) { + if (mask[i]) return extend_data[i]; + return DataT{0}; + }); + + ASSERT_TRUE(cuvs::devArrMatch(packed_list_data.data_handle(), + extend_data_filtered.data_handle(), + n_elems, cuvs::Compare(), stream_)); } + + auto unpacked_flat_codes = + raft::make_device_matrix(handle_, list_size, idx.dim()); + + helpers::codepacker::unpack( + handle_, list_data.view(), idx.veclen(), 0, unpacked_flat_codes.view()); + + ASSERT_TRUE(cuvs::devArrMatch(flat_codes.data_handle(), + unpacked_flat_codes.data_handle(), + list_size * ps.dim, + cuvs::Compare(), + stream_)); } } - */ + } void testFilter() { @@ -458,7 +416,6 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { raft::resource::sync_stream(handle_); } - /* { // unless something is really wrong with clustering, this could serve as a lower bound on // recall @@ -494,7 +451,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { test_ivf_sample_filter::offset)); raft::resource::sync_stream(handle_); - raft::core::bitset removed_indices_bitset( + cuvs::core::bitset removed_indices_bitset( handle_, removed_indices.view(), ps.num_db_vecs); // Search with the filter @@ -507,7 +464,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { search_queries_view, indices_ivfflat_dev.view(), distances_ivfflat_dev.view(), - cuvs::neighbors::filtering::bitset_filter(removed_indices_bitset.view())); + cuvs::neighbors::filtering::bitset_filter(removed_indices_bitset.view())); raft::update_host( distances_ivfflat.data(), distances_ivfflat_dev.data_handle(), queries_size, stream_); @@ -524,7 +481,6 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { 0.001, min_recall)); } - */ } void SetUp() override @@ -591,7 +547,6 @@ const std::vector> inputs = { {1000, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::L2Expanded, true}, {10000, 131072, 8, 10, 20, 1024, cuvs::distance::DistanceType::L2Expanded, false}, - /* // host input data {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, false, true}, {1000, 10000, 16, 10, 50, 1024, cuvs::distance::DistanceType::L2Expanded, false, true}, @@ -600,7 +555,6 @@ const std::vector> inputs = { {20, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::L2Expanded, false, true}, {1000, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::L2Expanded, false, true}, {10000, 131072, 8, 10, 20, 1024, cuvs::distance::DistanceType::L2Expanded, false, true}, - */ {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, true}, {1000, 10000, 16, 10, 50, 1024, cuvs::distance::DistanceType::InnerProduct, true}, diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index 5ff7bef61..15b12b118 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -37,6 +37,6 @@ include(../cmake/thirdparty/get_cuvs.cmake) # -------------- compile tasks ----------------- # add_executable(CAGRA_EXAMPLE 
src/cagra_example.cu)
-# `$<TARGET_NAME_IF_EXISTS:conda_env>` is a generator expression that ensures that targets
-# are installed in a conda environment, if one exists
+# `$<TARGET_NAME_IF_EXISTS:conda_env>` is a generator expression that ensures that targets are
+# installed in a conda environment, if one exists
 target_link_libraries(CAGRA_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
diff --git a/rapids_config.cmake b/rapids_config.cmake
index 19906dbc6..4ceefb991 100644
--- a/rapids_config.cmake
+++ b/rapids_config.cmake
@@ -22,13 +22,15 @@ else()
   string(REPLACE "\n" "\n  " _rapids_version_formatted "  ${_rapids_version}")
   message(
     FATAL_ERROR
-      "Could not determine RAPIDS version. Contents of VERSION file:\n${_rapids_version_formatted}")
+      "Could not determine RAPIDS version. Contents of VERSION file:\n${_rapids_version_formatted}"
+  )
 endif()

 if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/CUVS_RAPIDS-${RAPIDS_VERSION_MAJOR_MINOR}.cmake")
   file(
     DOWNLOAD
     "https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION_MAJOR_MINOR}/RAPIDS.cmake"
-    "${CMAKE_CURRENT_BINARY_DIR}/CUVS_RAPIDS-${RAPIDS_VERSION_MAJOR_MINOR}.cmake")
+    "${CMAKE_CURRENT_BINARY_DIR}/CUVS_RAPIDS-${RAPIDS_VERSION_MAJOR_MINOR}.cmake"
+  )
 endif()
 include("${CMAKE_CURRENT_BINARY_DIR}/CUVS_RAPIDS-${RAPIDS_VERSION_MAJOR_MINOR}.cmake")
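
The sketch below is illustrative only and is not part of the patch. It exercises the public entry points that this change reroutes from the raft runtime to the cuvs-native implementations: build, search, the string-based serialize/deserialize pair, and the newly instantiated search_with_filtering. Handle creation and data loading are omitted, and the <cuvs/core/bitset.hpp> include path plus the bitset/bitset_filter template parameters are assumptions modelled on the updated test in cpp/test/neighbors/ann_ivf_flat.cuh rather than declarations introduced in this diff.

// Illustrative usage sketch (see the assumptions above); C++ host code.
#include <cuvs/core/bitset.hpp>  // assumed header for cuvs::core::bitset
#include <cuvs/neighbors/ivf_flat.hpp>

#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_mdspan.hpp>
#include <raft/core/resources.hpp>

#include <cstdint>
#include <string>

// Build an index, run a plain search, then round-trip the index through the
// string-based serialize/deserialize entry points instantiated in this patch.
inline void ivf_flat_round_trip(raft::resources const& handle,
                                raft::device_matrix_view<const float, int64_t> dataset,
                                raft::device_matrix_view<const float, int64_t> queries)
{
  namespace ivf_flat = cuvs::neighbors::ivf_flat;

  ivf_flat::index_params build_params;
  build_params.n_lists = 1024;  // number of inverted lists (clusters)
  build_params.metric  = cuvs::distance::DistanceType::L2Expanded;
  auto idx = ivf_flat::build(handle, build_params, dataset);

  int64_t k = 10;
  ivf_flat::search_params search_params;
  search_params.n_probes = 50;
  auto neighbors = raft::make_device_matrix<int64_t, int64_t>(handle, queries.extent(0), k);
  auto distances = raft::make_device_matrix<float, int64_t>(handle, queries.extent(0), k);
  ivf_flat::search(handle, search_params, idx, queries, neighbors.view(), distances.view());

  // serialize(handle, str, ...) and serialize_file(handle, filename, ...) share
  // the same detail::serialize stream writer, so the byte formats are identical.
  std::string blob;
  ivf_flat::serialize(handle, blob, idx);

  ivf_flat::index<float, int64_t> restored(
    handle, build_params, static_cast<uint32_t>(dataset.extent(1)));
  ivf_flat::deserialize(handle, blob, &restored);
}

// Filtered search: dataset rows flagged in the bitset are skipped, mirroring
// the filter test in cpp/test/neighbors/ann_ivf_flat.cuh.
inline void ivf_flat_filtered_search(
  raft::resources const& handle,
  cuvs::neighbors::ivf_flat::index<float, int64_t>& idx,
  raft::device_matrix_view<const float, int64_t> queries,
  raft::device_vector_view<const int64_t, int64_t> removed_row_ids,
  int64_t n_dataset_rows)
{
  namespace ivf_flat = cuvs::neighbors::ivf_flat;

  // One bit per dataset row; the listed rows are excluded from the results.
  cuvs::core::bitset<std::uint32_t, int64_t> removed(handle, removed_row_ids, n_dataset_rows);

  int64_t k = 10;
  ivf_flat::search_params search_params;
  search_params.n_probes = 50;
  auto neighbors = raft::make_device_matrix<int64_t, int64_t>(handle, queries.extent(0), k);
  auto distances = raft::make_device_matrix<float, int64_t>(handle, queries.extent(0), k);

  ivf_flat::search_with_filtering(
    handle,
    search_params,
    idx,
    queries,
    neighbors.view(),
    distances.view(),
    cuvs::neighbors::filtering::bitset_filter<std::uint32_t, int64_t>(removed.view()));
}

Both entry points dispatch to detail::search_with_filtering, which batches the queries so that the per-batch workspace stays within roughly 1 GiB (kExpectedWsSize in ivf_flat_search.cuh); callers therefore do not need to split large query sets themselves.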