Skip to content
This repository has been archived by the owner on Nov 25, 2024. It is now read-only.

add benchmark #65

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ option(BUILD_SHARED_LIBS "Build libwholegraph shared libraries" ON)
option(CMAKE_CUDA_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler" OFF)
option(BUILD_TESTS "Configure CMake to build tests" ON)
option(CUDA_STATIC_RUNTIME "Statically link the CUDA toolkit runtime and libraries" OFF)
option(BUILD_BENCHMARKS "Configure CMake to build benchmark" ON)

##############################################################################
# - Set options based on user defined one -----------------------------------
Expand Down Expand Up @@ -203,6 +204,11 @@ if(BUILD_TESTS AND CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
add_subdirectory(tests)
endif()

# Build the micro-benchmarks only when enabled and when this project is the
# top-level CMake project (skipped when consumed via add_subdirectory()).
if(BUILD_BENCHMARKS AND CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
  add_subdirectory(bench)
endif()

##############################################################################
# - code checker -------------------------------------------------------------

Expand Down
70 changes: 70 additions & 0 deletions cpp/bench/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#=============================================================================
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#=============================================================================
# BUILD_BENCHMARKS is declared as an option() in the parent cpp/CMakeLists.txt.
message(VERBOSE "WHOLEGRAPH: Building wholegraph C++ benchmarks: ${BUILD_BENCHMARKS}")

# ConfigureBench(NAME <target> PATH <source>... [TARGETS ...] [CONFIGURATIONS ...])
#
# Creates one benchmark executable and its install rule.
#   NAME - name of the benchmark target to create (required)
#   PATH - source files compiled into the benchmark (required)
# TARGETS and CONFIGURATIONS are accepted for call-site compatibility but are
# currently unused.
function(ConfigureBench)

  set(options OPTIONAL)
  set(oneValueArgs NAME)
  set(multiValueArgs PATH TARGETS CONFIGURATIONS)
  # PARSE_ARGV handles empty and semicolon-containing arguments correctly,
  # unlike the legacy ${ARGN} form.
  cmake_parse_arguments(PARSE_ARGV 0 ConfigureBench
                        "${options}" "${oneValueArgs}" "${multiValueArgs}")

  if(NOT ConfigureBench_NAME)
    message(FATAL_ERROR "ConfigureBench: NAME is required")
  endif()
  set(BENCH_NAME ${ConfigureBench_NAME})

  add_executable(${BENCH_NAME} ${ConfigureBench_PATH})

  target_include_directories(${BENCH_NAME} PRIVATE "$<BUILD_INTERFACE:${WHOLEGRAPH_SOURCE_DIR}>/src")

  # Link the imported Threads target instead of the raw 'pthread' string so the
  # correct thread flags/library are chosen portably.
  find_package(Threads REQUIRED)
  target_link_libraries(
    ${BENCH_NAME}
    PRIVATE wholegraph
            raft::raft
            rmm::rmm
            Threads::Threads
  )

  set_target_properties(
    ${BENCH_NAME}
    PROPERTIES # set target compile options
               INSTALL_RPATH "\$ORIGIN/../../../lib"
               CXX_STANDARD 17
               CXX_STANDARD_REQUIRED ON
               CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
               POSITION_INDEPENDENT_CODE ON
               RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${WHOLEGRAPH_BINARY_DIR}/gbench>"
               INTERFACE_POSITION_INDEPENDENT_CODE ON
  )
  # Warnings are errors for benchmark code, except deprecation warnings.
  target_compile_options(${BENCH_NAME} PUBLIC $<$<COMPILE_LANG_AND_ID:CXX,GNU,Clang>:-Wall -Werror
                                              -Wno-error=deprecated-declarations>)

  install(
    TARGETS ${BENCH_NAME}
    COMPONENT testing
    DESTINATION bin/gbench/libwholegraph
    EXCLUDE_FROM_ALL
  )

endfunction()

# Register each benchmark executable here.
if(BUILD_BENCHMARKS)
  ConfigureBench(
    NAME GATHER_SCATTER_BENCH
    PATH wholememory_ops/gather_scatter_bench.cu
         common/wholegraph_benchmark.cpp
  )
endif()
132 changes: 132 additions & 0 deletions cpp/bench/common/wholegraph_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "wholegraph_benchmark.hpp"

#include "wholememory/communicator.hpp"

#include <wholememory/tensor_description.h>
#include <wholememory/wholememory.h>

#include <cstdint>
#include <experimental/functional>
#include <experimental/random>
#include <functional>
#include <random>
#include <string>
#include <vector>

namespace wholegraph::bench {

// Fill the host array `indices` with uniformly random values in
// [0, max_indices). Writes indice_desc.size elements starting at
// indice_desc.storage_offset. Each call is freshly seeded from
// std::random_device, so consecutive calls produce different sequences
// (matching the previous std::experimental::reseed() behavior).
//
// Uses the standard <random> facilities instead of the non-portable
// Library Fundamentals TS <experimental/random>.
template <typename IndexT>
void host_get_random_integer_indices(void* indices,
                                     wholememory_array_description_t indice_desc,
                                     int64_t max_indices)
{
  IndexT* indices_ptr = static_cast<IndexT*>(indices);
  std::random_device rd;
  std::mt19937_64 gen(rd());
  std::uniform_int_distribution<IndexT> dist(0, static_cast<IndexT>(max_indices - 1));
  for (int64_t i = 0; i < indice_desc.size; i++) {
    indices_ptr[i + indice_desc.storage_offset] = dist(gen);
  }
}

// Randomly initialize an index array on the host, dispatching on the element
// type recorded in indices_desc: WHOLEMEMORY_DT_INT uses 32-bit int, every
// other dtype falls through to int64_t.
void host_random_init_integer_indices(void* indices,
                                      wholememory_array_description_t indices_desc,
                                      int64_t max_indices)
{
  if (indices_desc.dtype == WHOLEMEMORY_DT_INT) {
    host_get_random_integer_indices<int>(indices, indices_desc, max_indices);
    return;
  }
  host_get_random_integer_indices<int64_t>(indices, indices_desc, max_indices);
}

// Measures the average per-call latency of run_fn across all ranks of wm_comm
// and prints min/max/avg for every metric in `meter` (rank 0 only).
//
// Phases, each separated by barrier_fn() so ranks stay in lock-step:
//   1. warm-up: run_fn is repeated until meter.warmup_seconds has elapsed,
//      with a cudaDeviceSynchronize after every call.
//   2. timed runs: run_fn is called up to meter.run_count times, capped at
//      meter.max_run_seconds of wall-clock time.
//   3. reduction: each rank's scaled metric is allgather'ed; rank 0 reports.
void MultiProcessMeasurePerformance(std::function<void()> run_fn,
                                    wholememory_comm_t& wm_comm,
                                    const PerformanceMeter& meter,
                                    const std::function<void()>& barrier_fn)
{
  barrier_fn();
  // warm up: repeat run_fn until warmup_seconds of wall-clock time has passed
  struct timeval tv_warmup_s;
  gettimeofday(&tv_warmup_s, nullptr);
  int64_t target_warmup_time = 1000LL * 1000LL * meter.warmup_seconds;
  while (true) {
    struct timeval tv_warmup_c;
    gettimeofday(&tv_warmup_c, nullptr);
    int64_t time_warmup = TIME_DIFF_US(tv_warmup_s, tv_warmup_c);
    if (time_warmup >= target_warmup_time) break;
    run_fn();
    WHOLEMEMORY_CHECK_NOTHROW(cudaDeviceSynchronize() == cudaSuccess);
  }
  WHOLEMEMORY_CHECK_NOTHROW(cudaDeviceSynchronize() == cudaSuccess);
  barrier_fn();

  // timed runs
  struct timeval tv_run_s, tv_run_e;
  int64_t max_run_us = 1000LL * 1000LL * meter.max_run_seconds;
  gettimeofday(&tv_run_s, nullptr);
  int real_run_count = 0;
  for (int i = 0; i < meter.run_count; i++) {
    run_fn();
    real_run_count++;
    struct timeval tv_run_c;
    gettimeofday(&tv_run_c, nullptr);
    int64_t time_run_used = TIME_DIFF_US(tv_run_s, tv_run_c);
    // NOTE(review): the `real_run_count >= meter.run_count` clause is
    // redundant with the loop bound, and the optional per-iteration sync
    // below runs AFTER this elapsed-time check, so an interval measured with
    // meter.sync set may still include queued-but-unfinished device work
    // until the unconditional sync after the loop — confirm intended.
    if (time_run_used >= max_run_us || real_run_count >= meter.run_count) break;
    if (meter.sync) { WHOLEMEMORY_CHECK_NOTHROW(cudaDeviceSynchronize() == cudaSuccess); }
  }
  WHOLEMEMORY_CHECK_NOTHROW(cudaDeviceSynchronize() == cudaSuccess);
  gettimeofday(&tv_run_e, nullptr);
  int64_t real_time_used_us = TIME_DIFF_US(tv_run_s, tv_run_e);
  // average wall-clock time of one run_fn call, in microseconds
  double single_run_time_us = real_time_used_us;
  single_run_time_us /= real_run_count;
  barrier_fn();

  for (size_t i = 0; i < meter.metrics_.size(); i++) {
    // Scale the raw metric by the measured time:
    //   invert == true  : value * seconds-per-run  (e.g. time per unit of work)
    //   invert == false : value / seconds-per-run  (e.g. throughput per second)
    double metric_value = meter.metrics_[i].value;
    if (meter.metrics_[i].invert) {
      metric_value *= single_run_time_us;
      metric_value /= 1e6;
    } else {
      metric_value /= single_run_time_us;
      metric_value *= 1e6;
    }

    // Gather this rank's scaled metric from every rank, then reduce.
    std::vector<double> recv_vec(wm_comm->world_size);
    wm_comm->host_allgather(&metric_value, recv_vec.data(), 1, WHOLEMEMORY_DT_DOUBLE);
    double min_metric, max_metric, avg_metric;
    min_metric = max_metric = recv_vec[0];
    avg_metric = 0.0;
    for (int j = 0; j < wm_comm->world_size; j++) {
      min_metric = std::min(min_metric, recv_vec[j]);
      max_metric = std::max(max_metric, recv_vec[j]);
      avg_metric += recv_vec[j];
    }
    avg_metric /= wm_comm->world_size;
    // Only rank 0 prints, so the report appears once per job.
    // NOTE(review): the ",," in the format string looks like a typo.
    if (wm_comm->world_rank == 0) {
      fprintf(stderr,
              "== Metric: %20s: min=%.2lf %s,, max=%.2lf %s,, avg=%.2lf %s\n",
              meter.metrics_[i].name.c_str(),
              min_metric,
              meter.metrics_[i].unit.c_str(),
              max_metric,
              meter.metrics_[i].unit.c_str(),
              avg_metric,
              meter.metrics_[i].unit.c_str());
    }
  }
}

} // namespace wholegraph::bench
108 changes: 108 additions & 0 deletions cpp/bench/common/wholegraph_benchmark.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cstdint>
#include <cuda_runtime.h>
#include <sys/time.h>

#include <functional>
#include <string>
#include <vector>

#include "error.hpp"

#include <wholememory/tensor_description.h>
#include <wholememory/wholememory.h>
namespace wholegraph::bench {

// Elapsed microseconds from timeval TVS to TVE.
// Computed in signed 64-bit: with the previous 1000ULL factors, a TVE earlier
// than TVS wrapped to a huge unsigned value instead of going negative.
// Macro arguments are parenthesized so expression arguments expand safely.
#define TIME_DIFF_US(TVS, TVE)                                          \
  ((int64_t)((TVE).tv_sec - (TVS).tv_sec) * 1000LL * 1000LL +           \
   (int64_t)((TVE).tv_usec - (TVS).tv_usec))

void host_random_init_integer_indices(void* indices,
wholememory_array_description_t indices_desc,
int64_t max_indices);

// One named benchmark metric, e.g. ("bandwidth", "GB/s", bytes_moved, false).
// `invert` selects how MultiProcessMeasurePerformance scales `value` by the
// measured run time: true multiplies by seconds, false divides by seconds.
struct Metric {
  // Members are set via the initializer list (idiomatic; avoids
  // default-construct-then-assign of the std::string members).
  Metric(const std::string& metrics_name,
         const std::string& metrics_unit,
         const double metrics_value,
         bool inv)
    : name(metrics_name), unit(metrics_unit), value(metrics_value), invert(inv)
  {
  }
  std::string name;   // human-readable metric name used in the report
  std::string unit;   // unit string printed after each value
  double value;       // raw per-run quantity before time scaling
  bool invert;        // true: scale as value*time; false: value/time
};

// Fluent configuration object consumed by MultiProcessMeasurePerformance.
// Every setter returns *this so calls can be chained:
//   PerformanceMeter().SetName("gather").SetRunCount(50).SetSync();
struct PerformanceMeter {
  // --- configuration state (declaration order preserved; defaults shown) ---
  bool sync = false;             // sync the device after each timed run
  float warmup_seconds = 0.05f;  // minimum warm-up duration in seconds
  std::vector<Metric> metrics_;  // metrics reported for each measurement
  int run_count = 100;           // maximum number of timed runs
  float max_run_seconds = 10;    // wall-clock cap on the timed phase
  std::string name;              // label for this measurement

  // --- fluent setters ---
  PerformanceMeter& SetSync()
  {
    sync = true;
    return *this;
  }

  PerformanceMeter& SetWarmupTime(float w)
  {
    warmup_seconds = w;
    return *this;
  }

  PerformanceMeter& AddMetrics(const std::string& metrics_name,
                               const std::string& unit,
                               double value,
                               bool inv = false)
  {
    metrics_.emplace_back(metrics_name, unit, value, inv);
    return *this;
  }

  PerformanceMeter& SetRunCount(int count)
  {
    run_count = count;
    return *this;
  }

  PerformanceMeter& SetMaxRunSeconds(float sec)
  {
    max_run_seconds = sec;
    return *this;
  }

  PerformanceMeter& SetName(const std::string& n)
  {
    name = n;
    return *this;
  }
};

void MultiProcessMeasurePerformance(std::function<void()> run_fn,
wholememory_comm_t& wm_comm,
const PerformanceMeter& meter,
const std::function<void()>& barrier_fn);

} // namespace wholegraph::bench
Loading
Loading