Merge pull request #463 from kroma-network/perf/use-msm-gpu-when-generating-groth16-proof

perf: use msm gpu when generating groth16 proof
kroma-network · Jul 8, 2024 · e6e2a65 · e6e2a65
2 parents 5fde7a7 + 34705c8
commit e6e2a65
Show file tree

Hide file tree

Showing 19 changed files with 366 additions and 194 deletions.
diff --git a/tachyon/base/openmp_util.h b/tachyon/base/openmp_util.h
@@ -10,12 +10,18 @@
 
 #if defined(TACHYON_HAS_OPENMP)
 #define CONSTEXPR_IF_NOT_OPENMP
+#define OMP_FOR _Pragma("omp for")
+#define OMP_FOR_NOWAIT _Pragma("omp for nowait")
+#define OMP_PARALLEL _Pragma("omp parallel")
 #define OPENMP_PARALLEL_FOR(expr) _Pragma("omp parallel for") for (expr)
 #define OPENMP_PARALLEL_NESTED_FOR(expr) \
   _Pragma("omp parallel for collapse(2)") for (expr)
 #define OPENMP_FOR(expr) _Pragma("omp for") for (expr)
 #else
 #define CONSTEXPR_IF_NOT_OPENMP constexpr
+#define OMP_FOR
+#define OMP_FOR_NOWAIT
+#define OMP_PARALLEL
 #define OPENMP_PARALLEL_FOR(expr) for (expr)
 #define OPENMP_PARALLEL_NESTED_FOR(expr) for (expr)
 #define OPENMP_FOR(expr) for (expr)

diff --git a/tachyon/math/elliptic_curves/msm/BUILD.bazel b/tachyon/math/elliptic_curves/msm/BUILD.bazel
@@ -82,7 +82,6 @@ tachyon_cuda_unittest(
         ":variable_base_msm_gpu",
         "//tachyon/device/gpu:scoped_mem_pool",
         "//tachyon/device/gpu:scoped_stream",
-        "//tachyon/math/elliptic_curves/bn/bn254:g1",
         "//tachyon/math/elliptic_curves/msm/test:variable_base_msm_test_set",
     ],
 )
diff --git a/tachyon/math/elliptic_curves/msm/algorithms/icicle/BUILD.bazel b/tachyon/math/elliptic_curves/msm/algorithms/icicle/BUILD.bazel
@@ -4,9 +4,18 @@ load("//bazel:tachyon_cc.bzl", "tachyon_cuda_library")
 package(default_visibility = ["//visibility:public"])
 
 tachyon_cuda_library(
-    name = "icicle_msm_bn254",
-    srcs = if_cuda(["icicle_msm_bn254.cc"]),
-    hdrs = ["icicle_msm_bn254.h"],
+    name = "icicle_msm_bn254_g1",
+    srcs = if_cuda(["icicle_msm_bn254_g1.cc"]),
+    hdrs = ["icicle_msm_bn254_g1.h"],
+    deps = ["@icicle//:msm"] + if_cuda([
+        "@local_config_cuda//cuda:cudart_static",
+    ]),
+)
+
+tachyon_cuda_library(
+    name = "icicle_msm_bn254_g2",
+    srcs = if_cuda(["icicle_msm_bn254_g2.cc"]),
+    hdrs = ["icicle_msm_bn254_g2.h"],
     deps = ["@icicle//:msm"] + if_cuda([
         "@local_config_cuda//cuda:cudart_static",
     ]),
@@ -16,9 +25,13 @@ tachyon_cuda_library(
     name = "icicle_msm",
     hdrs = ["icicle_msm.h"],
     deps = [
-        ":icicle_msm_bn254",
+        ":icicle_msm_bn254_g1",
+        ":icicle_msm_bn254_g2",
+        "//tachyon/base:bit_cast",
         "//tachyon/device/gpu:gpu_device_functions",
         "//tachyon/device/gpu:gpu_enums",
         "//tachyon/math/elliptic_curves:points",
+        "//tachyon/math/elliptic_curves/bn/bn254:g1",
+        "//tachyon/math/elliptic_curves/bn/bn254:g2",
     ],
 )
diff --git a/tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm.h b/tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm.h
@@ -5,9 +5,13 @@
 
 #include "third_party/icicle/include/fields/id.h"
 
+#include "tachyon/base/bit_cast.h"
 #include "tachyon/device/gpu/gpu_device_functions.h"
 #include "tachyon/device/gpu/gpu_enums.h"
-#include "tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_bn254.h"
+#include "tachyon/math/elliptic_curves/bn/bn254/g1.h"
+#include "tachyon/math/elliptic_curves/bn/bn254/g2.h"
+#include "tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_bn254_g1.h"
+#include "tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_bn254_g2.h"
 #include "tachyon/math/elliptic_curves/projective_point.h"
 
 namespace tachyon::math {
@@ -44,40 +48,70 @@ class IcicleMSM {
   template <typename BaseContainer, typename ScalarContainer>
   [[nodiscard]] bool Run(const BaseContainer& cpu_bases,
                          const ScalarContainer& cpu_scalars,
-                         ProjectivePoint<Curve>* cpu_result) {
-#if FIELD_ID != BN254
-#error Only Bn254 is supported
-#endif
-
-    using BaseField = typename Point::BaseField;
-    using BigInt = typename Point::BaseField::BigIntTy;
-
-    size_t bases_size = std::size(cpu_bases);
-    size_t scalars_size = std::size(cpu_scalars);
-
-    if (bases_size != scalars_size) {
-      LOG(ERROR) << "bases_size and scalars_size don't match";
-      return false;
-    }
-
-    ::bn254::projective_t ret;
-    gpuError_t error = tachyon_bn254_msm_cuda(
-        reinterpret_cast<const ::bn254::scalar_t*>(std::data(cpu_scalars)),
-        reinterpret_cast<const ::bn254::affine_t*>(std::data(cpu_bases)),
-        bases_size, *config_, &ret);
-    if (error != gpuSuccess) return false;
-    *cpu_result = {BaseField(reinterpret_cast<const BigInt&>(ret.x)),
-                   BaseField(reinterpret_cast<const BigInt&>(ret.y)),
-                   BaseField(reinterpret_cast<const BigInt&>(ret.z))};
-    return true;
-  }
+                         ProjectivePoint<Curve>* cpu_result);
 
  private:
   gpuMemPool_t mem_pool_ = nullptr;
   gpuStream_t stream_ = nullptr;
   std::unique_ptr<::msm::MSMConfig> config_;
 };
 
+template <>
+template <typename BaseContainer, typename ScalarContainer>
+bool IcicleMSM<bn254::G1AffinePoint>::Run(const BaseContainer& cpu_bases,
+                                          const ScalarContainer& cpu_scalars,
+                                          ProjectivePoint<Curve>* cpu_result) {
+#if FIELD_ID != BN254
+#error Only Bn254 is supported
+#endif
+
+  size_t bases_size = std::size(cpu_bases);
+  size_t scalars_size = std::size(cpu_scalars);
+
+  if (bases_size != scalars_size) {
+    LOG(ERROR) << "bases_size and scalars_size don't match";
+    return false;
+  }
+
+  ::bn254::projective_t ret;
+  gpuError_t error = tachyon_bn254_g1_msm_cuda(
+      reinterpret_cast<const ::bn254::scalar_t*>(std::data(cpu_scalars)),
+      reinterpret_cast<const ::bn254::affine_t*>(std::data(cpu_bases)),
+      bases_size, *config_, &ret);
+  if (error != gpuSuccess) return false;
+  ret = ::bn254::projective_t::to_montgomery(ret);
+  *cpu_result = base::bit_cast<ProjectivePoint<Curve>>(ret);
+  return true;
+}
+
+template <>
+template <typename BaseContainer, typename ScalarContainer>
+bool IcicleMSM<bn254::G2AffinePoint>::Run(const BaseContainer& cpu_bases,
+                                          const ScalarContainer& cpu_scalars,
+                                          ProjectivePoint<Curve>* cpu_result) {
+#if FIELD_ID != BN254
+#error Only Bn254 is supported
+#endif
+
+  size_t bases_size = std::size(cpu_bases);
+  size_t scalars_size = std::size(cpu_scalars);
+
+  if (bases_size != scalars_size) {
+    LOG(ERROR) << "bases_size and scalars_size don't match";
+    return false;
+  }
+
+  ::bn254::g2_projective_t ret;
+  gpuError_t error = tachyon_bn254_g2_msm_cuda(
+      reinterpret_cast<const ::bn254::scalar_t*>(std::data(cpu_scalars)),
+      reinterpret_cast<const ::bn254::g2_affine_t*>(std::data(cpu_bases)),
+      bases_size, *config_, &ret);
+  if (error != gpuSuccess) return false;
+  ret = ::bn254::g2_projective_t::to_montgomery(ret);
+  *cpu_result = base::bit_cast<ProjectivePoint<Curve>>(ret);
+  return true;
+}
+
 }  // namespace tachyon::math
 
 #endif  // TACHYON_MATH_ELLIPTIC_CURVES_MSM_ALGORITHMS_ICICLE_ICICLE_MSM_H_
diff --git a/tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_bn254.cc b/tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_bn254.cc
diff --git a/tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_bn254.h b/tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_bn254.h
diff --git a/tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_bn254_g1.cc b/tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_bn254_g1.cc
@@ -0,0 +1,10 @@
+#include "tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_bn254_g1.h"
+
+#include "third_party/icicle/src/msm/msm.cu.cc"  // NOLINT(build/include)
+
+cudaError_t tachyon_bn254_g1_msm_cuda(const ::bn254::scalar_t* scalars,
+                                      const ::bn254::affine_t* points,
+                                      int msm_size, ::msm::MSMConfig& config,
+                                      ::bn254::projective_t* out) {
+  return ::msm::msm(scalars, points, msm_size, config, out);
+}
diff --git a/tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_bn254_g1.h b/tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_bn254_g1.h
@@ -0,0 +1,11 @@
+#ifndef TACHYON_MATH_ELLIPTIC_CURVES_MSM_ALGORITHMS_ICICLE_ICICLE_MSM_BN254_G1_H_
+#define TACHYON_MATH_ELLIPTIC_CURVES_MSM_ALGORITHMS_ICICLE_ICICLE_MSM_BN254_G1_H_
+
+#include "third_party/icicle/include/curves/params/bn254.cu.h"
+#include "third_party/icicle/include/msm/msm.cu.h"
+
+extern "C" cudaError_t tachyon_bn254_g1_msm_cuda(
+    const ::bn254::scalar_t* scalars, const ::bn254::affine_t* points,
+    int msm_size, ::msm::MSMConfig& config, ::bn254::projective_t* out);
+
+#endif  // TACHYON_MATH_ELLIPTIC_CURVES_MSM_ALGORITHMS_ICICLE_ICICLE_MSM_BN254_G1_H_
diff --git a/tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_bn254_g2.cc b/tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_bn254_g2.cc
@@ -0,0 +1,10 @@
+#include "tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_bn254_g2.h"
+
+#include "third_party/icicle/src/msm/msm.cu.cc"  // NOLINT(build/include)
+
+cudaError_t tachyon_bn254_g2_msm_cuda(const ::bn254::scalar_t* scalars,
+                                      const ::bn254::g2_affine_t* points,
+                                      int msm_size, ::msm::MSMConfig& config,
+                                      ::bn254::g2_projective_t* out) {
+  return ::msm::msm(scalars, points, msm_size, config, out);
+}
diff --git a/tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_bn254_g2.h b/tachyon/math/elliptic_curves/msm/algorithms/icicle/icicle_msm_bn254_g2.h
@@ -0,0 +1,11 @@
+#ifndef TACHYON_MATH_ELLIPTIC_CURVES_MSM_ALGORITHMS_ICICLE_ICICLE_MSM_BN254_G2_H_
+#define TACHYON_MATH_ELLIPTIC_CURVES_MSM_ALGORITHMS_ICICLE_ICICLE_MSM_BN254_G2_H_
+
+#include "third_party/icicle/include/curves/params/bn254.cu.h"
+#include "third_party/icicle/include/msm/msm.cu.h"
+
+extern "C" cudaError_t tachyon_bn254_g2_msm_cuda(
+    const ::bn254::scalar_t* scalars, const ::bn254::g2_affine_t* points,
+    int msm_size, ::msm::MSMConfig& config, ::bn254::g2_projective_t* out);
+
+#endif  // TACHYON_MATH_ELLIPTIC_CURVES_MSM_ALGORITHMS_ICICLE_ICICLE_MSM_BN254_G2_H_
diff --git a/tachyon/math/elliptic_curves/msm/variable_base_msm_gpu_unittest.cc b/tachyon/math/elliptic_curves/msm/variable_base_msm_gpu_unittest.cc
@@ -7,7 +7,6 @@
 #include "tachyon/device/gpu/gpu_enums.h"
 #include "tachyon/device/gpu/scoped_mem_pool.h"
 #include "tachyon/device/gpu/scoped_stream.h"
-#include "tachyon/math/elliptic_curves/bn/bn254/g1.h"
 #include "tachyon/math/elliptic_curves/msm/test/variable_base_msm_test_set.h"
 
 namespace tachyon::math {
@@ -18,15 +17,18 @@ constexpr size_t kThreadNum = 32;
 
 using namespace device;
 
+template <typename Point>
 class VariableMSMCorrectnessGpuTest : public testing::Test {
  public:
+  using Curve = typename Point::Curve;
+
   constexpr static size_t kLogCount = 10;
   constexpr static size_t kCount = 1 << kLogCount;
 
   static void SetUpTestSuite() {
-    bn254::G1Curve::Init();
+    Point::Curve::Init();
 
-    test_set_ = VariableBaseMSMTestSet<bn254::G1AffinePoint>::Random(
+    test_set_ = VariableBaseMSMTestSet<Point>::Random(
         kCount, VariableBaseMSMMethod::kMSM);
 
     expected_ = test_set_.answer.ToProjective();
@@ -35,17 +37,25 @@ class VariableMSMCorrectnessGpuTest : public testing::Test {
   static void TearDownTestSuite() { GPU_MUST_SUCCESS(gpuDeviceReset(), ""); }
 
  protected:
-  static VariableBaseMSMTestSet<bn254::G1AffinePoint> test_set_;
-  static bn254::G1ProjectivePoint expected_;
+  static VariableBaseMSMTestSet<Point> test_set_;
+  static ProjectivePoint<Curve> expected_;
 };
 
-VariableBaseMSMTestSet<bn254::G1AffinePoint>
-    VariableMSMCorrectnessGpuTest::test_set_;
-bn254::G1ProjectivePoint VariableMSMCorrectnessGpuTest::expected_;
+template <typename Point>
+VariableBaseMSMTestSet<Point> VariableMSMCorrectnessGpuTest<Point>::test_set_;
+template <typename Point>
+ProjectivePoint<typename Point::Curve>
+    VariableMSMCorrectnessGpuTest<Point>::expected_;
 
 }  // namespace
 
-TEST_F(VariableMSMCorrectnessGpuTest, MSM) {
+using PointTypes = testing::Types<bn254::G1AffinePoint, bn254::G2AffinePoint>;
+TYPED_TEST_SUITE(VariableMSMCorrectnessGpuTest, PointTypes);
+
+TYPED_TEST(VariableMSMCorrectnessGpuTest, MSM) {
+  using Point = TypeParam;
+  using Curve = typename Point::Curve;
+
   gpuMemPoolProps props = {gpuMemAllocationTypePinned,
                            gpuMemHandleTypeNone,
                            {gpuMemLocationTypeDevice, 0}};
@@ -58,11 +68,11 @@ TEST_F(VariableMSMCorrectnessGpuTest, MSM) {
 
   gpu::ScopedStream stream = gpu::CreateStream();
 
-  VariableBaseMSMGpu<bn254::G1AffinePoint> msm_gpu(mem_pool.get(),
-                                                   stream.get());
-  bn254::G1ProjectivePoint actual;
-  ASSERT_TRUE(msm_gpu.Run(test_set_.bases, test_set_.scalars, &actual));
-  EXPECT_EQ(actual, expected_);
+  VariableBaseMSMGpu<Point> msm_gpu(mem_pool.get(), stream.get());
+  ProjectivePoint<Curve> actual;
+  ASSERT_TRUE(
+      msm_gpu.Run(this->test_set_.bases, this->test_set_.scalars, &actual));
+  EXPECT_EQ(actual, this->expected_);
 }
 
 }  // namespace tachyon::math
diff --git a/tachyon/zk/r1cs/constraint_system/quadratic_arithmetic_program.h b/tachyon/zk/r1cs/constraint_system/quadratic_arithmetic_program.h
@@ -26,19 +26,15 @@ namespace tachyon::zk::r1cs {
 template <typename F>
 F EvaluateConstraint(const std::vector<Cell<F>>& cells,
                      absl::Span<const F> assignments) {
-  std::vector<F> sums = base::ParallelizeMap(
-      cells, [assignments](absl::Span<const Cell<F>> chunk) {
-        F sum;
-        for (const Cell<F>& cell : chunk) {
-          if (cell.coefficient.IsOne()) {
-            sum += assignments[cell.index];
-          } else {
-            sum += assignments[cell.index] * cell.coefficient;
-          }
-        }
-        return sum;
-      });
-  return std::accumulate(sums.begin(), sums.end(), F::Zero(), std::plus<>());
+  F sum;
+  for (const Cell<F>& cell : cells) {
+    if (cell.coefficient.IsOne()) {
+      sum += assignments[cell.index];
+    } else {
+      sum += assignments[cell.index] * cell.coefficient;
+    }
+  }
+  return sum;
 }
 
 template <typename F>
@@ -142,10 +138,21 @@ class QuadraticArithmeticProgram {
     //        = 0                      (otherwise)
     // where x is |full_assignments|.
     // clang-format on
-    OPENMP_PARALLEL_FOR(size_t i = 0; i < matrices.num_constraints; ++i) {
-      a[i] = EvaluateConstraint(matrices.a[i], full_assignments);
-      b[i] = EvaluateConstraint(matrices.b[i], full_assignments);
-      c[i] = EvaluateConstraint(matrices.c[i], full_assignments);
+    OMP_PARALLEL {
+      OMP_FOR_NOWAIT
+      for (size_t i = 0; i < matrices.num_constraints; ++i) {
+        a[i] = EvaluateConstraint(matrices.a[i], full_assignments);
+      }
+
+      OMP_FOR_NOWAIT
+      for (size_t i = 0; i < matrices.num_constraints; ++i) {
+        b[i] = EvaluateConstraint(matrices.b[i], full_assignments);
+      }
+
+      OMP_FOR
+      for (size_t i = 0; i < matrices.num_constraints; ++i) {
+        c[i] = EvaluateConstraint(matrices.c[i], full_assignments);
+      }
     }
 
     for (size_t i = matrices.num_constraints;

diff --git a/tachyon/zk/r1cs/groth16/BUILD.bazel b/tachyon/zk/r1cs/groth16/BUILD.bazel
@@ -38,7 +38,10 @@ tachyon_cc_library(
         ":proof",
         ":proving_key",
         "//tachyon/base:optional",
+        "//tachyon/device/gpu:scoped_mem_pool",
+        "//tachyon/device/gpu:scoped_stream",
         "//tachyon/math/elliptic_curves/msm:variable_base_msm",
+        "//tachyon/math/elliptic_curves/msm:variable_base_msm_gpu",
         "//tachyon/zk/r1cs/constraint_system:qap_witness_map_result",
     ],
 )