Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

accumulate stwo #535

Merged
merged 18 commits into from
Jun 10, 2024
3 changes: 3 additions & 0 deletions icicle/include/api/babybear.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ extern "C" cudaError_t babybear_mul_cuda(
extern "C" cudaError_t babybear_add_cuda(
babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);

extern "C" cudaError_t babybear_accumulate_cuda(
babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);

jeremyfelder marked this conversation as resolved.
Show resolved Hide resolved
extern "C" cudaError_t babybear_sub_cuda(
babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result);

Expand Down
3 changes: 3 additions & 0 deletions icicle/include/api/bls12_377.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ extern "C" cudaError_t bls12_377_mul_cuda(
extern "C" cudaError_t bls12_377_add_cuda(
bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);

extern "C" cudaError_t bls12_377_accumulate_cuda(
bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);

extern "C" cudaError_t bls12_377_sub_cuda(
bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result);

Expand Down
3 changes: 3 additions & 0 deletions icicle/include/api/bls12_381.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ extern "C" cudaError_t bls12_381_mul_cuda(
extern "C" cudaError_t bls12_381_add_cuda(
bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);

extern "C" cudaError_t bls12_381_accumulate_cuda(
bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);

extern "C" cudaError_t bls12_381_sub_cuda(
bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result);

Expand Down
3 changes: 3 additions & 0 deletions icicle/include/api/bn254.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,9 @@ extern "C" cudaError_t bn254_mul_cuda(
extern "C" cudaError_t bn254_add_cuda(
bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);

extern "C" cudaError_t bn254_accumulate_cuda(
bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);

extern "C" cudaError_t bn254_sub_cuda(
bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result);

Expand Down
3 changes: 3 additions & 0 deletions icicle/include/api/bw6_761.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ extern "C" cudaError_t bw6_761_mul_cuda(
extern "C" cudaError_t bw6_761_add_cuda(
bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);

extern "C" cudaError_t bw6_761_accumulate_cuda(
bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);

extern "C" cudaError_t bw6_761_sub_cuda(
bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result);

Expand Down
3 changes: 3 additions & 0 deletions icicle/include/api/grumpkin.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ extern "C" cudaError_t grumpkin_mul_cuda(
extern "C" cudaError_t grumpkin_add_cuda(
grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);

extern "C" cudaError_t grumpkin_accumulate_cuda(
grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);

extern "C" cudaError_t grumpkin_sub_cuda(
grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result);

Expand Down
3 changes: 3 additions & 0 deletions icicle/include/api/stark252.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ extern "C" cudaError_t stark252_mul_cuda(
extern "C" cudaError_t stark252_add_cuda(
stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);

extern "C" cudaError_t stark252_accumulate_cuda(
stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);

extern "C" cudaError_t stark252_sub_cuda(
stark252::scalar_t* vec_a, stark252::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, stark252::scalar_t* result);

Expand Down
3 changes: 3 additions & 0 deletions icicle/include/api/templates/fields/vec_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ extern "C" cudaError_t ${FIELD}_mul_cuda(
extern "C" cudaError_t ${FIELD}_add_cuda(
${FIELD}::scalar_t* vec_a, ${FIELD}::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::scalar_t* result);

extern "C" cudaError_t ${FIELD}_accumulate_cuda(
${FIELD}::scalar_t* vec_a, ${FIELD}::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config);

vhnatyk marked this conversation as resolved.
Show resolved Hide resolved
extern "C" cudaError_t ${FIELD}_sub_cuda(
${FIELD}::scalar_t* vec_a, ${FIELD}::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::scalar_t* result);

Expand Down
3 changes: 3 additions & 0 deletions icicle/include/api/templates/fields/vec_ops_ext.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ extern "C" cudaError_t ${FIELD}_extension_mul_cuda(
extern "C" cudaError_t ${FIELD}_extension_add_cuda(
${FIELD}::extension_t* vec_a, ${FIELD}::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::extension_t* result);

extern "C" cudaError_t ${FIELD}_extension_accumulate_cuda(
${FIELD}::extension_t* vec_a, ${FIELD}::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config);

extern "C" cudaError_t ${FIELD}_extension_sub_cuda(
${FIELD}::extension_t* vec_a, ${FIELD}::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, ${FIELD}::extension_t* result);

Expand Down
12 changes: 12 additions & 0 deletions icicle/src/vec_ops/extern.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,18 @@ namespace vec_ops {
return add<scalar_t>(vec_a, vec_b, n, config, result);
}

/**
* Accumulate (as vec_a[i] += vec_b[i]) function with the template parameter
* `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
* or standalone "STARK field" given by `-DFIELD`).
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t
CONCAT_EXPAND(FIELD, accumulate_cuda)(scalar_t* vec_a, scalar_t* vec_b, int n, VecOpsConfig& config)
{
return add<scalar_t>(vec_a, vec_b, n, config, vec_a);
}

/**
* Extern version of [Sub](@ref Sub) function with the template parameter
* `E` being the [field](@ref scalar_t) (either scalar field of the curve given by `-DCURVE`
Expand Down
11 changes: 11 additions & 0 deletions icicle/src/vec_ops/extern_extension.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,17 @@ namespace vec_ops {
return add<extension_t>(vec_a, vec_b, n, config, result);
}

/**
* Accumulate (as vec_a[i] += vec_b[i]) function with the template parameter
* `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable during build.
* @return `cudaSuccess` if the execution was successful and an error code otherwise.
*/
extern "C" cudaError_t
CONCAT_EXPAND(FIELD, extension_accumulate_cuda)(extension_t* vec_a, extension_t* vec_b, int n, VecOpsConfig& config)
{
return add<extension_t>(vec_a, vec_b, n, config, vec_a);
}

/**
* Extern version of [Sub](@ref Sub) function with the template parameter
* `E` being the [extension field](@ref extension_t) of the base field given by `-DFIELD` env variable during build.
Expand Down
31 changes: 21 additions & 10 deletions icicle/src/vec_ops/vec_ops.cu
Original file line number Diff line number Diff line change
Expand Up @@ -82,16 +82,19 @@ namespace vec_ops {
} // namespace

template <typename E, void (*Kernel)(const E*, const E*, int, E*)>
cudaError_t vec_op(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
cudaError_t vec_op(E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
{
CHK_INIT_IF_RETURN();

bool is_in_place = vec_a == result;

// Set the grid and block dimensions
int num_threads = MAX_THREADS_PER_BLOCK;
int num_blocks = (n + num_threads - 1) / num_threads;

E *d_result, *d_alloc_vec_a, *d_alloc_vec_b;
const E *d_vec_a, *d_vec_b;
E* d_vec_a;
const E* d_vec_b;
if (!config.is_a_on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
Expand All @@ -109,41 +112,49 @@ namespace vec_ops {
}

if (!config.is_result_on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream));
if (!is_in_place) {
CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream));
} else {
d_result = d_vec_a;
}
} else {
d_result = result;
if (!is_in_place) {
d_result = result;
} else {
d_result = result = d_vec_a;
}
}

// Call the kernel to perform element-wise operation
Kernel<<<num_blocks, num_threads, 0, config.ctx.stream>>>(d_vec_a, d_vec_b, n, d_result);

if (!config.is_a_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
if (!config.is_b_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }

if (!config.is_result_on_device) {
CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream));
}

if (!config.is_a_on_device && !is_in_place) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
jeremyfelder marked this conversation as resolved.
Show resolved Hide resolved
if (!config.is_b_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }

if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(config.ctx.stream));

return CHK_LAST();
}

template <typename E>
cudaError_t mul(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
cudaError_t mul(E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
vhnatyk marked this conversation as resolved.
Show resolved Hide resolved
{
return vec_op<E, mul_kernel>(vec_a, vec_b, n, config, result);
}

template <typename E>
cudaError_t add(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
cudaError_t add(E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
{
return vec_op<E, add_kernel>(vec_a, vec_b, n, config, result);
}

template <typename E>
cudaError_t sub(const E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
cudaError_t sub(E* vec_a, const E* vec_b, int n, VecOpsConfig& config, E* result)
{
return vec_op<E, sub_kernel>(vec_a, vec_b, n, config, result);
}
Expand Down
3 changes: 2 additions & 1 deletion icicle/tests/runner.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
#include <iostream>

// include list of test files
// Ensure the device_error_test.cu is last to prevent aborting mid-test run
#include "field_test.cu"
#ifdef CURVE_ID
#include "curve_test.cu"
#endif
#include "error_handler_test.cu"

// Ensure the device_error_test.cu is last to prevent aborting mid-test run
#include "device_error_test.cu"

int main(int argc, char** argv)
Expand Down
45 changes: 44 additions & 1 deletion wrappers/rust/icicle-core/src/vec_ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,12 @@ pub trait VecOps<F> {
cfg: &VecOpsConfig,
) -> IcicleResult<()>;

fn accumulate(
a: &mut (impl HostOrDeviceSlice<F> + ?Sized),
b: &(impl HostOrDeviceSlice<F> + ?Sized),
cfg: &VecOpsConfig,
) -> IcicleResult<()>;

fn sub(
a: &(impl HostOrDeviceSlice<F> + ?Sized),
b: &(impl HostOrDeviceSlice<F> + ?Sized),
Expand Down Expand Up @@ -207,6 +213,19 @@ where
<<F as FieldImpl>::Config as VecOps<F>>::add(a, b, result, &cfg)
}

pub fn accumulate_scalars<F>(
a: &mut (impl HostOrDeviceSlice<F> + ?Sized),
b: &(impl HostOrDeviceSlice<F> + ?Sized),
cfg: &VecOpsConfig,
) -> IcicleResult<()>
where
F: FieldImpl,
<F as FieldImpl>::Config: VecOps<F>,
{
let cfg = check_vec_ops_args(a, b, a, cfg);
<<F as FieldImpl>::Config as VecOps<F>>::accumulate(a, b, &cfg)
}

pub fn sub_scalars<F>(
a: &(impl HostOrDeviceSlice<F> + ?Sized),
b: &(impl HostOrDeviceSlice<F> + ?Sized),
Expand Down Expand Up @@ -299,6 +318,14 @@ macro_rules! impl_vec_ops_field {
result: *mut $field,
) -> CudaError;

#[link_name = concat!($field_prefix, "_accumulate_cuda")]
pub(crate) fn accumulate_scalars_cuda(
a: *mut $field,
b: *const $field,
size: u32,
cfg: *const VecOpsConfig,
) -> CudaError;

#[link_name = concat!($field_prefix, "_sub_cuda")]
pub(crate) fn sub_scalars_cuda(
a: *const $field,
Expand Down Expand Up @@ -357,6 +384,22 @@ macro_rules! impl_vec_ops_field {
}
}

fn accumulate(
a: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
b: &(impl HostOrDeviceSlice<$field> + ?Sized),
cfg: &VecOpsConfig,
) -> IcicleResult<()> {
unsafe {
$field_prefix_ident::accumulate_scalars_cuda(
a.as_mut_ptr(),
b.as_ptr(),
a.len() as u32,
cfg as *const VecOpsConfig,
)
.wrap()
}
}

fn sub(
a: &(impl HostOrDeviceSlice<$field> + ?Sized),
b: &(impl HostOrDeviceSlice<$field> + ?Sized),
Expand Down Expand Up @@ -457,7 +500,7 @@ macro_rules! impl_vec_add_tests {
) => {
#[test]
pub fn test_vec_add_scalars() {
check_vec_ops_scalars::<$field>()
check_vec_ops_scalars::<$field>();
}

#[test]
Expand Down
12 changes: 10 additions & 2 deletions wrappers/rust/icicle-core/src/vec_ops/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,21 @@ use crate::vec_ops::{
};
use icicle_cuda_runtime::memory::{DeviceVec, HostSlice};

use super::accumulate_scalars;

pub fn check_vec_ops_scalars<F: FieldImpl>()
where
<F as FieldImpl>::Config: VecOps<F> + GenerateRandom<F>,
{
let test_size = 1 << 14;

let a = F::Config::generate_random(test_size);
let mut a = F::Config::generate_random(test_size);
let b = F::Config::generate_random(test_size);
let ones = vec![F::one(); test_size];
let mut result = vec![F::zero(); test_size];
let mut result2 = vec![F::zero(); test_size];
let mut result3 = vec![F::zero(); test_size];
let a = HostSlice::from_slice(&a);
let a = HostSlice::from_mut_slice(&mut a);
let b = HostSlice::from_slice(&b);
let ones = HostSlice::from_slice(&ones);
let result = HostSlice::from_mut_slice(&mut result);
Expand All @@ -34,6 +36,12 @@ where
mul_scalars(a, ones, result3, &cfg).unwrap();

assert_eq!(a[0], result3[0]);

add_scalars(a, b, result, &cfg).unwrap();

accumulate_scalars(a, b, &cfg).unwrap();

assert_eq!(a[0], result[0]);
}

pub fn check_bit_reverse<F: FieldImpl>()
Expand Down
Loading