diff --git a/docs/docs/icicle/golang-bindings/vec-ops.md b/docs/docs/icicle/golang-bindings/vec-ops.md index e93d9a0a2..e219ec26d 100644 --- a/docs/docs/icicle/golang-bindings/vec-ops.md +++ b/docs/docs/icicle/golang-bindings/vec-ops.md @@ -4,8 +4,8 @@ Icicle exposes a number of vector operations which a user can use: -* The VecOps API provides efficient vector operations such as addition, subtraction, and multiplication. -* MatrixTranspose API allows a user to perform a transpose on a vector representation of a matrix +* The VecOps API provides efficient vector operations such as addition, subtraction, and multiplication, supporting both single and batched operations. +* MatrixTranspose API allows a user to perform a transpose on a vector representation of a matrix, with support for batched transpositions. ## VecOps API Documentation @@ -121,6 +121,8 @@ type VecOpsConfig struct { isBOnDevice bool isResultOnDevice bool IsAsync bool + batch_size int + columns_batch bool Ext config_extension.ConfigExtensionHandler } ``` @@ -132,6 +134,8 @@ type VecOpsConfig struct { - **`isBOnDevice`**: Indicates if vector `b` is located on the device. - **`isResultOnDevice`**: Specifies where the result vector should be stored (device or host memory). - **`IsAsync`**: Controls whether the vector operation runs asynchronously. +- **`batch_size`**: Number of vectors (or operations) to process in a batch. Each vector operation will be performed independently on each batch element. +- **`columns_batch`**: true if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). If false, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array). - **`Ext`**: Extended configuration for backend. #### Default Configuration @@ -148,6 +152,8 @@ This section describes the functionality of the `TransposeMatrix` function used The function takes a matrix represented as a 1D slice and transposes it, storing the result in another 1D slice. +If VecOpsConfig specifies a batch_size greater than one, the transposition is performed on multiple matrices simultaneously, producing corresponding transposed matrices. The storage arrangement of batched matrices is determined by the columns_batch field in the VecOpsConfig. + ### Function ```go diff --git a/docs/docs/icicle/primitives/vec_ops.md b/docs/docs/icicle/primitives/vec_ops.md index e9e10c1a9..7f546dc16 100644 --- a/docs/docs/icicle/primitives/vec_ops.md +++ b/docs/docs/icicle/primitives/vec_ops.md @@ -16,6 +16,8 @@ The `VecOpsConfig` struct is a configuration object used to specify parameters f - **`is_b_on_device: bool`**: Indicates whether the second input vector (`b`) is already on the device. If `false`, the vector will be copied from the host to the device. This field is optional. - **`is_result_on_device: bool`**: Indicates whether the result should be stored on the device. If `false`, the result will be transferred back to the host. - **`is_async: bool`**: Specifies whether the vector operation should be performed asynchronously. When `true`, the operation will not block the CPU, allowing other operations to proceed concurrently. Asynchronous execution requires careful synchronization to ensure data integrity. +- **`batch_size: int`**: Number of vectors (or operations) to process in a batch. Each vector operation will be performed independently on each batch element. 
+- **`columns_batch: bool`**: True if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). If false, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array). - **`ext: ConfigExtension*`**: Backend-specific extensions. #### Default Configuration @@ -28,6 +30,9 @@ static VecOpsConfig default_vec_ops_config() { false, // is_b_on_device false, // is_result_on_device false, // is_async + 1, // batch_size + false, // columns_batch + nullptr // ext }; return config; } @@ -35,7 +40,7 @@ static VecOpsConfig default_vec_ops_config() { ### Element-wise Operations -These functions perform element-wise operations on two input vectors `a` and `b`, producing an output vector. +These functions perform element-wise operations on two input vectors a and b. If VecOpsConfig specifies a batch_size greater than one, the operations are performed on multiple pairs of vectors simultaneously, producing corresponding output vectors. #### `vector_add` @@ -90,9 +95,31 @@ template eIcicleError convert_montgomery(const T* input, uint64_t size, bool is_into, const VecOpsConfig& config, T* output); ``` +### Reduction operations + +These functions perform reduction operations on vectors. If VecOpsConfig specifies a batch_size greater than one, the operations are performed on multiple vectors simultaneously, producing corresponding output values. The storage arrangement of batched vectors is determined by the columns_batch field in the VecOpsConfig. + +#### `vector_sum` + +Computes the sum of all elements in each vector in a batch. + +```cpp +template +eIcicleError vector_sum(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); +``` + +#### `vector_product` + +Computes the product of all elements in each vector in a batch. + +```cpp +template +eIcicleError vector_product(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); +``` + ### Scalar-Vector Operations -These functions apply a scalar operation to each element of a vector. +These functions apply a scalar operation to each element of a vector. If VecOpsConfig specifies a batch_size greater than one, the operations are performed on multiple vector-scalar pairs simultaneously, producing corresponding output vectors. #### `scalar_add_vec / scalar_sub_vec` @@ -123,7 +150,7 @@ eIcicleError scalar_mul_vec(const T* scalar_a, const T* vec_b, uint64_t size, co ### Matrix Operations -These functions perform operations on matrices. +These functions perform operations on matrices. If VecOpsConfig specifies a batch_size greater than one, the operations are performed on multiple matrices simultaneously, producing corresponding output matrices. #### `matrix_transpose` @@ -138,7 +165,7 @@ eIcicleError matrix_transpose(const T* mat_in, uint32_t nof_rows, uint32_t nof_c #### `bit_reverse` -Reorders the vector elements based on a bit-reversal pattern. +Reorders the vector elements based on a bit-reversal pattern. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously. ```cpp template @@ -147,16 +174,16 @@ eIcicleError bit_reverse(const T* vec_in, uint64_t size, const VecOpsConfig& con #### `slice` -Extracts a slice from a vector. +Extracts a slice from a vector. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously, producing corresponding output vectors. 
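To make the `columns_batch` layout concrete before the updated `slice` signature below, here is a minimal sketch of how an element of a batched vector is addressed under the two layouts described above. This is illustrative plain C++ only; the helper name is hypothetical and not part of the ICICLE API:

```cpp
#include <cstdint>

// Index of element `i` of vector `idx_in_batch` in a batch of `batch_size`
// vectors, each of length `size`.
// columns_batch == true : vectors are the columns of a row-major (size x batch_size)
//                         matrix, so consecutive elements of one vector are
//                         `batch_size` apart in memory.
// columns_batch == false: vectors are stored back to back (vector j starts at j * size).
inline uint64_t batched_index(
  uint64_t i, uint64_t idx_in_batch, uint64_t size, uint64_t batch_size, bool columns_batch)
{
  return columns_batch ? i * batch_size + idx_in_batch : idx_in_batch * size + i;
}
```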
```cpp template -eIcicleError slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size, const VecOpsConfig& config, T* vec_out); +eIcicleError slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size_in, uint64_t size_out, const VecOpsConfig& config, T* vec_out); ``` #### `highest_non_zero_idx` -Finds the highest non-zero index in a vector. +Finds the highest non-zero index in a vector. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously. ```cpp template @@ -165,7 +192,7 @@ eIcicleError highest_non_zero_idx(const T* vec_in, uint64_t size, const VecOpsCo #### `polynomial_eval` -Evaluates a polynomial at given domain points. +Evaluates a polynomial at given domain points. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously. ```cpp template @@ -174,7 +201,7 @@ eIcicleError polynomial_eval(const T* coeffs, uint64_t coeffs_size, const T* dom #### `polynomial_division` -Divides two polynomials. +Divides two polynomials. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously. ```cpp template diff --git a/docs/docs/icicle/programmers_guide/general.md b/docs/docs/icicle/programmers_guide/general.md index b02cd2f9c..0bef2b850 100644 --- a/docs/docs/icicle/programmers_guide/general.md +++ b/docs/docs/icicle/programmers_guide/general.md @@ -21,6 +21,7 @@ The configuration struct allows users to modify settings such as: - Specifying whether inputs and outputs are on the host or device. - Adjusting the data layout for specific optimizations. +- Setting batching parameters (batch_size and columns_batch) to perform operations on multiple data sets simultaneously. - Passing custom options to the backend implementation through an extension mechanism, such as setting the number of CPU cores to use. ### Example (C++) @@ -31,6 +32,8 @@ The configuration struct allows users to modify settings such as: // Create config struct for vector add VecOpsConfig config = default_vec_ops_config(); // optionally modify the config struct here +config.batch_size = 4; // Process 4 vector operations in a batch +config.columns_batch = true; // Batched vectors are stored as columns // Call the API eIcicleError err = vector_add(vec_a, vec_b, size, config, vec_res); @@ -45,6 +48,8 @@ struct VecOpsConfig { bool is_b_on_device; /**< True if `b` is on the device, false if it is not. Default value: false. OPTIONAL. */ bool is_result_on_device; /**< If true, the output is preserved on the device, otherwise on the host. Default value: false. */ bool is_async; /**< Whether to run the vector operations asynchronously. */ + int batch_size; /**< Number of vector operations to process in a batch. Default value: 1. */ + bool columns_batch; /**< True if batched vectors are stored as columns; false if stored contiguously. Default value: false. */ ConfigExtension* ext = nullptr; /**< Backend-specific extension. */ }; ``` diff --git a/docs/docs/icicle/rust-bindings/vec-ops.md b/docs/docs/icicle/rust-bindings/vec-ops.md index 61aa71570..c42caafb5 100644 --- a/docs/docs/icicle/rust-bindings/vec-ops.md +++ b/docs/docs/icicle/rust-bindings/vec-ops.md @@ -1,10 +1,10 @@ # Vector Operations API -Our vector operations API includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory. 
+Our vector operations API includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory, as well as batched operations. ## Vector Operations Configuration -The `VecOpsConfig` struct encapsulates the settings for vector operations, including device context and operation modes. +The `VecOpsConfig` struct encapsulates the settings for vector operations, including device context, operation modes, and batching parameters. ### `VecOpsConfig` @@ -17,6 +17,8 @@ pub struct VecOpsConfig { pub is_b_on_device: bool, pub is_result_on_device: bool, pub is_async: bool, + pub batch_size: usize, + pub columns_batch: bool, pub ext: ConfigExtension, } ``` @@ -28,6 +30,9 @@ pub struct VecOpsConfig { - **`is_b_on_device: bool`**: Indicates whether the input b data has been preloaded on the device memory. If `false` inputs will be copied from host to device. - **`is_result_on_device: bool`**: Indicates whether the output data is preloaded in device memory. If `false` outputs will be copied from host to device. - **`is_async: bool`**: Specifies whether the NTT operation should be performed asynchronously. +- **`batch_size: usize`**: Number of vector operations to process in a single batch. Each operation will be performed independently on each batch element. +- **`columns_batch: bool`**: true if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). If false, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array). + - **`ext: ConfigExtension`**: extended configuration for backend. ### Default Configuration @@ -40,11 +45,11 @@ let cfg = VecOpsConfig::default(); ## Vector Operations -Vector operations are implemented through the `VecOps` trait, providing methods for addition, subtraction, and multiplication of vectors. +Vector operations are implemented through the `VecOps` trait, providing methods for addition, subtraction, and multiplication of vectors. These methods support both single and batched operations based on the batch_size and columns_batch configurations. ### Methods -All operations are element-wise operations, and the results placed into the `result` param. These operations are not in place. +All operations are element-wise operations, and the results placed into the `result` param. These operations are not in place, except for accumulate. - **`add`**: Computes the element-wise sum of two vectors. - **`accumulate`**: Sum input b to a inplace. 
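Across the C++, Go, and Rust bindings the batching fields behave the same way. As a concrete illustration, here is a hedged C++ sketch of a batched element-wise addition using the `VecOpsConfig` fields documented above; the header path, the `scalar_t` alias, and the sizes are assumptions made for the example, not taken verbatim from the library:

```cpp
#include <cstdint>
#include "icicle/vec_ops.h"             // assumed front-end header for vector_add / VecOpsConfig
#include "icicle/fields/field_config.h" // assumed: provides the scalar_t field type

using namespace field_config;

// Adds `batch` pairs of vectors of length `size` in a single call.
// The vectors are stored contiguously: pair j occupies elements [j*size, (j+1)*size).
eIcicleError batched_add(
  const scalar_t* a, const scalar_t* b, uint64_t size, int batch, scalar_t* out)
{
  VecOpsConfig cfg = default_vec_ops_config();
  cfg.batch_size = batch;    // number of independent additions
  cfg.columns_batch = false; // set to true if the vectors are column-strided instead
  return vector_add(a, b, size /*length of one vector*/, cfg, out);
}
```

Note that `size` remains the length of a single vector; the backend derives the total amount of work from `size * batch_size`.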
diff --git a/examples/c++/polynomial-multiplication/example.cpp b/examples/c++/polynomial-multiplication/example.cpp index 9bd90b842..1fdfeb501 100644 --- a/examples/c++/polynomial-multiplication/example.cpp +++ b/examples/c++/polynomial-multiplication/example.cpp @@ -69,21 +69,18 @@ int main(int argc, char** argv) ICICLE_CHECK(bn254_ntt(polyB.get(), NTT_SIZE, NTTDir::kForward, &ntt_config, d_polyB)); // (4) multiply A,B - VecOpsConfig config{ - nullptr, - true, // is_a_on_device - true, // is_b_on_device - true, // is_result_on_device - false, // is_async - nullptr // ext - }; - ICICLE_CHECK(bn254_vector_mul(d_polyA, d_polyB, NTT_SIZE, &config, d_polyRes)); + VecOpsConfig config = default_vec_ops_config(); + config.is_a_on_device = true; + config.is_b_on_device = true; + config.is_result_on_device = true; + + ICICLE_CHECK(vector_mul(d_polyA, d_polyB, NTT_SIZE, config, d_polyRes)); // (5) INTT (in place) ntt_config.are_inputs_on_device = true; ntt_config.are_outputs_on_device = true; ntt_config.ordering = Ordering::kMN; - ICICLE_CHECK(bn254_ntt(d_polyRes, NTT_SIZE, NTTDir::kInverse, &ntt_config, d_polyRes)); + ICICLE_CHECK(ntt(d_polyRes, NTT_SIZE, NTTDir::kInverse, ntt_config, d_polyRes)); if (print) { END_TIMER(poly_multiply, "polynomial multiplication took"); } diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 3a2156d60..22c257023 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -6,6 +6,9 @@ #include "icicle/fields/field_config.h" #include "tasks_manager.h" +#include +#include +#include using namespace field_config; using namespace icicle; @@ -17,15 +20,17 @@ enum VecOperation { VECTOR_SUB, VECTOR_MUL, VECTOR_DIV, + CONVERT_TO_MONTGOMERY, + CONVERT_FROM_MONTGOMERY, VECTOR_SUM, VECTOR_PRODUCT, SCALAR_ADD_VEC, SCALAR_SUB_VEC, SCALAR_MUL_VEC, - CONVERT_TO_MONTGOMERY, - CONVERT_FROM_MONTGOMERY, BIT_REVERSE, SLICE, + REPLACE_ELEMENTS, + OUT_OF_PLACE_MATRIX_TRANSPOSE, NOF_OPERATIONS }; @@ -46,18 +51,25 @@ class VectorOpTask : public TaskBase VectorOpTask() : TaskBase() {} // Set the operands to execute a task of 2 operands and 1 output and dispatch the task - void send_2ops_task(VecOperation operation, const int nof_operations, const T* op_a, const T* op_b, T* output) + void send_2ops_task( + VecOperation operation, + const uint32_t nof_operations, + const T* op_a, + const T* op_b, + const uint32_t stride, + T* output) { m_operation = operation; m_nof_operations = nof_operations; m_op_a = op_a; m_op_b = op_b; + m_stride = stride; m_output = output; dispatch(); } // Set the operands to execute a task of 1 operand and 1 output and dispatch the task - void send_1op_task(VecOperation operation, const int nof_operations, const T* op_a, T* output) + void send_1op_task(VecOperation operation, const uint32_t nof_operations, const T* op_a, T* output) { m_operation = operation; m_nof_operations = nof_operations; @@ -66,34 +78,94 @@ class VectorOpTask : public TaskBase dispatch(); } // Set the operands to execute a task of 1 operand and dispatch the task - void send_intermidiate_res_task(VecOperation operation, const int nof_operations, const T* op_a) + void + send_intermidiate_res_task(VecOperation operation, const uint64_t stop_index, const T* op_a, const uint64_t stride) { m_operation = operation; - m_nof_operations = nof_operations; + m_stop_index = stop_index; m_op_a = op_a; + m_stride = stride; dispatch(); } - // Set the operands to bitrev operation dispatch the task - void 
send_bitrev_task( - VecOperation operation, int bit_size, uint64_t start_index, const int nof_operations, const T* op_a, T* output) + // Set the operands for bit_reverse operation and dispatch the task + void send_bit_reverse_task( + VecOperation operation, + uint32_t bit_size, + uint64_t start_index, + const uint32_t nof_operations, + const T* op_a, + const uint64_t stride, + T* output) { m_operation = operation; + m_bit_size = bit_size; + m_start_index = start_index; m_nof_operations = nof_operations; m_op_a = op_a; + m_stride = stride; m_output = output; - m_bit_size = bit_size, m_start_index = start_index; dispatch(); } - // Set the operands to slice operation dispatch the task - void send_slice_task(VecOperation operation, uint64_t stride, const int nof_operations, const T* op_a, T* output) + // Set the operands for slice operation and dispatch the task + void send_slice_task( + VecOperation operation, + uint64_t stride, + uint64_t stride_out, + const uint32_t nof_operations, + const T* op_a, + T* output) { m_operation = operation; m_nof_operations = nof_operations; m_op_a = op_a; m_output = output; m_stride = stride; + m_stride_out = stride_out; + dispatch(); + } + + // Set the operands for replace_elements operation and dispatch the task + void send_replace_elements_task( + VecOperation operation, + const T* mat_in, + const uint32_t nof_operations, + std::vector& start_indices_in_mat, + uint64_t start_index, + uint32_t log_nof_rows, + uint32_t log_nof_cols, + const uint32_t stride, + T* mat_out) + { + m_operation = operation; + m_op_a = mat_in; + m_nof_operations = nof_operations; + m_start_indices_in_mat = &start_indices_in_mat; + m_start_index = start_index; // start index in start_indices vector + m_log_nof_rows = log_nof_rows; + m_log_nof_cols = log_nof_cols; + m_stride = stride; + m_output = mat_out; + dispatch(); + } + + void send_out_of_place_matrix_transpose_task( + VecOperation operation, + const T* mat_in, + const uint32_t nof_operations, + const uint32_t nof_rows, + const uint32_t nof_cols, + const uint32_t stride, + T* mat_out) + { + m_operation = operation; + m_op_a = mat_in; + m_nof_operations = nof_operations; + m_nof_rows = nof_rows; + m_nof_cols = nof_cols; + m_stride = stride; + m_output = mat_out; dispatch(); } @@ -130,56 +202,55 @@ class VectorOpTask : public TaskBase m_output[i] = m_op_a[i] * T::inverse(m_op_b[i]); } } - // Single worker functionality to execute scalar + vector - void scalar_add_vec() - { - for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = *m_op_a + m_op_b[i]; - } - } - // Single worker functionality to execute scalar - vector - void scalar_sub_vec() + // Single worker functionality to execute conversion from barret to montgomery + void convert_to_montgomery() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = *m_op_a + m_op_b[i]; + m_output[i] = T::to_montgomery(m_op_a[i]); } } - // Single worker functionality to execute scalar * vector - void scalar_mul_vec() + // Single worker functionality to execute conversion from montgomery to barret + void convert_from_montgomery() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = *m_op_a * m_op_b[i]; + m_output[i] = T::from_montgomery(m_op_a[i]); } } // Single worker functionality to execute sum(vector) void vector_sum() { - *m_output = m_op_a[0]; - for (uint64_t i = 1; i < m_nof_operations; ++i) { - *m_output = *m_output + m_op_a[i]; + m_intermidiate_res = T::zero(); + for (uint64_t i = 0; i < (m_stop_index * m_stride); i = i + m_stride) { + 
m_intermidiate_res = m_intermidiate_res + m_op_a[i]; } } // Single worker functionality to execute product(vector) void vector_product() { - *m_output = m_op_a[0]; - for (uint64_t i = 1; i < m_nof_operations; ++i) { - *m_output = *m_output * m_op_a[i]; + m_intermidiate_res = T::one(); + for (uint64_t i = 0; i < (m_stop_index * m_stride); i = i + m_stride) { + m_intermidiate_res = m_intermidiate_res * m_op_a[i]; } } - // Single worker functionality to execute conversion from barret to montgomery - void convert_to_montgomery() + // Single worker functionality to execute scalar + vector + void scalar_add_vec() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = T::to_montgomery(m_op_a[i]); + m_output[m_stride * i] = *m_op_a + m_op_b[m_stride * i]; } } - - // Single worker functionality to execute conversion from montgomery to barret - void convert_from_montgomery() + // Single worker functionality to execute scalar - vector + void scalar_sub_vec() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = T::from_montgomery(m_op_a[i]); + m_output[m_stride * i] = *m_op_a - m_op_b[m_stride * i]; + } + } + // Single worker functionality to execute scalar * vector + void scalar_mul_vec() + { + for (uint64_t i = 0; i < m_nof_operations; ++i) { + m_output[m_stride * i] = *m_op_a * m_op_b[m_stride * i]; } } // Single worker functionality to execute bit reverse reorder @@ -200,10 +271,10 @@ class VectorOpTask : public TaskBase if (m_output == m_op_a) { // inplace calculation if (rev_idx < idx) { // only on of the threads need to work - std::swap(m_output[idx], m_output[rev_idx]); + std::swap(m_output[m_stride * idx], m_output[m_stride * rev_idx]); } - } else { // out of place calculation - m_output[idx] = m_op_a[rev_idx]; // set index value + } else { // out of place calculation + m_output[m_stride * idx] = m_op_a[m_stride * rev_idx]; // set index value } } } @@ -212,7 +283,47 @@ class VectorOpTask : public TaskBase void slice() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = m_op_a[i * m_stride]; + m_output[i * m_stride_out] = m_op_a[i * m_stride]; + } + } + + // Function to perform modulus with Mersenne number + uint64_t mersenne_mod(uint64_t shifted_idx, uint32_t total_bits) + { + uint64_t mod = (1ULL << total_bits) - 1; + shifted_idx = (shifted_idx & mod) + (shifted_idx >> total_bits); + while (shifted_idx >= mod) { + shifted_idx = (shifted_idx & mod) + (shifted_idx >> total_bits); + } + return shifted_idx; + } + + // Single worker functionality to execute replace elements + void replace_elements() + { + const uint32_t total_bits = m_log_nof_rows + m_log_nof_cols; + for (uint32_t i = 0; i < m_nof_operations; ++i) { + uint64_t start_idx = (*m_start_indices_in_mat)[m_start_index + i]; + uint64_t idx = start_idx; + T prev = m_op_a[m_stride * idx]; + do { + uint64_t shifted_idx = idx << m_log_nof_rows; + uint64_t new_idx = mersenne_mod(shifted_idx, total_bits); + T next = m_op_a[m_stride * new_idx]; + m_output[m_stride * new_idx] = prev; + prev = next; + idx = new_idx; + } while (idx != start_idx); + } + } + + // Single worker functionality for out of place matrix transpose + void out_of_place_transpose() + { + for (uint32_t k = 0; k < m_nof_operations; ++k) { + for (uint32_t j = 0; j < m_nof_cols; ++j) { + m_output[m_stride * (j * m_nof_rows + k)] = m_op_a[m_stride * (k * m_nof_cols + j)]; + } } } @@ -223,27 +334,41 @@ class VectorOpTask : public TaskBase &VectorOpTask::vector_sub, // VECTOR_SUB, &VectorOpTask::vector_mul, // VECTOR_MUL, 
&VectorOpTask::vector_div, // VECTOR_DIV, + &VectorOpTask::convert_to_montgomery, // CONVERT_TO_MONTGOMERY, + &VectorOpTask::convert_from_montgomery, // CONVERT_FROM_MONTGOMERY, &VectorOpTask::vector_sum, // VECTOR_SUM &VectorOpTask::vector_product, // VECTOR_PRODUCT &VectorOpTask::scalar_add_vec, // SCALAR_ADD_VEC, &VectorOpTask::scalar_sub_vec, // SCALAR_SUB_VEC, &VectorOpTask::scalar_mul_vec, // SCALAR_MUL_VEC, - &VectorOpTask::convert_to_montgomery, // CONVERT_TO_MONTGOMERY, - &VectorOpTask::convert_from_montgomery, // CONVERT_FROM_MONTGOMERY, &VectorOpTask::bit_reverse, // BIT_REVERSE - &VectorOpTask::slice // SLICE + &VectorOpTask::slice, // SLICE + &VectorOpTask::replace_elements, // REPLACE_ELEMENTS + &VectorOpTask::out_of_place_transpose // OUT_OF_PLACE_MATRIX_TRANSPOSE + }; - VecOperation m_operation; // the operation to execute - int m_nof_operations; // number of operations to execute for this task - const T* m_op_a; // pointer to operand A. Operand A is a vector. - const T* m_op_b; // pointer to operand B. Operand B is a vector or scalar - uint64_t m_start_index; // index used in bitreverse - int m_bit_size; // use in bitrev operation - uint64_t m_stride; // used in slice operation - T* m_output; // pointer to the output. Can be a vector or scalar pointer - T m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer -}; + VecOperation m_operation; // the operation to execute + uint32_t m_nof_operations; // number of operations to execute for this task + const T* m_op_a; // pointer to operand A. Operand A is a vector, or matrix in case of replace_elements + const T* m_op_b; // pointer to operand B. Operand B is a vector or scalar + uint64_t m_start_index; // index used in bitreverse operation and out of place matrix transpose + uint64_t m_stop_index; // index used in reduce operations and out of place matrix transpose + uint32_t m_bit_size; // use in bitrev operation + uint64_t m_stride; // used to support column batch operations + uint64_t m_stride_out; // used in slice operation + T* + m_output; // pointer to the output. Can be a vector, scalar pointer, or a matrix pointer in case of replace_elements + uint32_t m_log_nof_rows; // log of the number of rows in the matrix, used in replace_elements + uint32_t m_log_nof_cols; // log of the number of columns in the matrix, used in replace_elements + uint32_t m_nof_rows; // the number of rows in the matrix, used in out of place matrix transpose + uint32_t m_nof_cols; // the number of columns in the matrix, used in out of place matrix transpose + const std::vector* m_start_indices_in_mat; // Indices used in replace_elements operations + +public: + T m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer + uint64_t m_idx_in_batch; // index in the batch. 
Used in intermediate res tasks +}; // class VectorOpTask #define NOF_OPERATIONS_PER_TASK 512 #define CONFIG_NOF_THREADS_KEY "n_threads" @@ -260,12 +385,14 @@ int get_nof_workers(const VecOpsConfig& config) // Execute a full task from the type vector = vector (op) vector template eIcicleError -cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < n; i += NOF_OPERATIONS_PER_TASK) { + TasksManager> task_manager(get_nof_workers(config) - 1); + const uint64_t total_nof_operations = size * config.batch_size; + for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_2ops_task(op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - i), vec_a + i, vec_b + i, output + i); + task_p->send_2ops_task( + op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), vec_a + i, vec_b + i, 1, output + i); } task_manager.wait_done(); return eIcicleError::SUCCESS; @@ -274,12 +401,19 @@ cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t n, con // Execute a full task from the type vector = scalar (op) vector template eIcicleError cpu_scalar_vector_op( - VecOperation op, const T* scalar_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) + VecOperation op, const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < n; i += NOF_OPERATIONS_PER_TASK) { - VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_2ops_task(op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - i), scalar_a, vec_b + i, output + i); + TasksManager> task_manager(get_nof_workers(config) - 1); + const uint64_t total_nof_operations = size; + const uint32_t stride = config.columns_batch ? config.batch_size : 1; + for (uint32_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_2ops_task( + op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), scalar_a + idx_in_batch, + config.columns_batch ? vec_b + idx_in_batch + i * config.batch_size : vec_b + idx_in_batch * size + i, stride, + config.columns_batch ? 
output + idx_in_batch + i * config.batch_size : output + idx_in_batch * size + i); + } } task_manager.wait_done(); return eIcicleError::SUCCESS; @@ -287,11 +421,12 @@ eIcicleError cpu_scalar_vector_op( /////////////////////////////////////////////////////// // Functions to register at the CPU backend +/*********************************** ADD ***********************************/ template -eIcicleError -cpu_vector_add(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_add( + const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_2vectors_op(VecOperation::VECTOR_ADD, vec_a, vec_b, n, config, output); + return cpu_2vectors_op(VecOperation::VECTOR_ADD, vec_a, vec_b, size, config, output); } REGISTER_VECTOR_ADD_BACKEND("CPU", cpu_vector_add); @@ -299,113 +434,149 @@ REGISTER_VECTOR_ADD_BACKEND("CPU", cpu_vector_add); /*********************************** ACCUMULATE ***********************************/ template eIcicleError -cpu_vector_accumulate(const Device& device, T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config) +cpu_vector_accumulate(const Device& device, T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config) { - for (uint64_t i = 0; i < n; ++i) { - vec_a[i] = vec_a[i] + vec_b[i]; - } - return eIcicleError::SUCCESS; + return cpu_2vectors_op(VecOperation::VECTOR_ADD, vec_a, vec_b, size, config, vec_a); } REGISTER_VECTOR_ACCUMULATE_BACKEND("CPU", cpu_vector_accumulate); /*********************************** SUB ***********************************/ template -eIcicleError -cpu_vector_sub(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_sub( + const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_2vectors_op(VecOperation::VECTOR_SUB, vec_a, vec_b, n, config, output); + return cpu_2vectors_op(VecOperation::VECTOR_SUB, vec_a, vec_b, size, config, output); } REGISTER_VECTOR_SUB_BACKEND("CPU", cpu_vector_sub); /*********************************** MUL ***********************************/ template -eIcicleError -cpu_vector_mul(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_mul( + const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_2vectors_op(VecOperation::VECTOR_MUL, vec_a, vec_b, n, config, output); + return cpu_2vectors_op(VecOperation::VECTOR_MUL, vec_a, vec_b, size, config, output); } REGISTER_VECTOR_MUL_BACKEND("CPU", cpu_vector_mul); /*********************************** DIV ***********************************/ template -eIcicleError -cpu_vector_div(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_div( + const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_2vectors_op(VecOperation::VECTOR_DIV, vec_a, vec_b, n, config, output); + return cpu_2vectors_op(VecOperation::VECTOR_DIV, vec_a, vec_b, size, config, output); } REGISTER_VECTOR_DIV_BACKEND("CPU", cpu_vector_div); -/*********************************** SUM ***********************************/ +/*********************************** CONVERT MONTGOMERY ***********************************/ template -eIcicleError 
cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) +eIcicleError cpu_convert_montgomery( + const Device& device, const T* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); - bool output_initialized = false; - uint64_t vec_s_offset = 0; - VectorOpTask* task_p; - // run until all vector deployed and all tasks completed - do { - task_p = vec_s_offset < n ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); - if (task_p->is_completed()) { - *output = output_initialized ? task_p->m_intermidiate_res : *output + task_p->m_intermidiate_res; - } - if (vec_s_offset < n) { - task_p->send_intermidiate_res_task( - VecOperation::VECTOR_SUM, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_s_offset), vec_a + vec_s_offset); - vec_s_offset += NOF_OPERATIONS_PER_TASK; - } - } while (task_p != nullptr); + TasksManager> task_manager(get_nof_workers(config) - 1); + const uint64_t total_nof_operations = size * config.batch_size; + for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_1op_task( + (is_to_montgomery ? CONVERT_TO_MONTGOMERY : CONVERT_FROM_MONTGOMERY), + std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), input + i, output + i); + } + task_manager.wait_done(); + for (uint64_t i = 0; i < size * config.batch_size; i++) {} return eIcicleError::SUCCESS; } -// Once backend will support - uncomment the following line -// REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); +REGISTER_CONVERT_MONTGOMERY_BACKEND("CPU", cpu_convert_montgomery); + /*********************************** SUM ***********************************/ + template -eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); - bool output_initialized = false; - uint64_t vec_s_offset = 0; - VectorOpTask* task_p; + TasksManager> task_manager(get_nof_workers(config) - 1); + std::vector output_initialized = std::vector(config.batch_size, false); + uint64_t vec_a_offset = 0; + uint64_t idx_in_batch = 0; // run until all vector deployed and all tasks completed - do { - task_p = vec_s_offset < n ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + while (true) { + VectorOpTask* task_p = + vec_a_offset < size ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + if (task_p == nullptr) { return eIcicleError::SUCCESS; } if (task_p->is_completed()) { - *output = output_initialized ? task_p->m_intermidiate_res : *output * task_p->m_intermidiate_res; + output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] + ? 
output[task_p->m_idx_in_batch] + task_p->m_intermidiate_res + : task_p->m_intermidiate_res; + output_initialized[task_p->m_idx_in_batch] = true; } - if (vec_s_offset < n) { + if (vec_a_offset < size) { + task_p->m_idx_in_batch = idx_in_batch; task_p->send_intermidiate_res_task( - VecOperation::VECTOR_SUM, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_s_offset), vec_a + vec_s_offset); - vec_s_offset += NOF_OPERATIONS_PER_TASK; + VecOperation::VECTOR_SUM, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - vec_a_offset), + config.columns_batch ? vec_a + idx_in_batch + vec_a_offset * config.batch_size + : vec_a + idx_in_batch * size + vec_a_offset, + config.columns_batch ? config.batch_size : 1); + idx_in_batch++; + if (idx_in_batch == config.batch_size) { + vec_a_offset += NOF_OPERATIONS_PER_TASK; + idx_in_batch = 0; + } + } else { + task_p->set_idle(); } - } while (task_p != nullptr); - return eIcicleError::SUCCESS; + } } -// Once backend will support - uncomment the following line -// REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); +REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); -/*********************************** MUL BY SCALAR***********************************/ +/*********************************** PRODUCT ***********************************/ template -eIcicleError cpu_scalar_mul( - const Device& device, const T* scalar_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +eIcicleError +cpu_vector_product(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_scalar_vector_op(VecOperation::SCALAR_MUL_VEC, scalar_a, vec_b, n, config, output); + TasksManager> task_manager(get_nof_workers(config) - 1); + std::vector output_initialized = std::vector(config.batch_size, false); + uint64_t vec_a_offset = 0; + uint64_t idx_in_batch = 0; + // run until all vector deployed and all tasks completed + while (true) { + VectorOpTask* task_p = + vec_a_offset < size ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + if (task_p == nullptr) { return eIcicleError::SUCCESS; } + if (task_p->is_completed()) { + output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] + ? output[task_p->m_idx_in_batch] * task_p->m_intermidiate_res + : task_p->m_intermidiate_res; + output_initialized[task_p->m_idx_in_batch] = true; + } + if (vec_a_offset < size) { + task_p->m_idx_in_batch = idx_in_batch; + task_p->send_intermidiate_res_task( + VecOperation::VECTOR_PRODUCT, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - vec_a_offset), + config.columns_batch ? vec_a + idx_in_batch + vec_a_offset * config.batch_size + : vec_a + idx_in_batch * size + vec_a_offset, + config.columns_batch ? 
config.batch_size : 1); + idx_in_batch++; + if (idx_in_batch == config.batch_size) { + vec_a_offset += NOF_OPERATIONS_PER_TASK; + idx_in_batch = 0; + } + } else { + task_p->set_idle(); + } + } } -REGISTER_SCALAR_MUL_VEC_BACKEND("CPU", cpu_scalar_mul); +REGISTER_VECTOR_PRODUCT_BACKEND("CPU", cpu_vector_product); /*********************************** Scalar + Vector***********************************/ template eIcicleError cpu_scalar_add( - const Device& device, const T* scalar_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) + const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_scalar_vector_op(VecOperation::SCALAR_ADD_VEC, scalar_a, vec_b, n, config, output); + return cpu_scalar_vector_op(VecOperation::SCALAR_ADD_VEC, scalar_a, vec_b, size, config, output); } REGISTER_SCALAR_ADD_VEC_BACKEND("CPU", cpu_scalar_add); @@ -413,57 +584,149 @@ REGISTER_SCALAR_ADD_VEC_BACKEND("CPU", cpu_scalar_add); /*********************************** Scalar - Vector***********************************/ template eIcicleError cpu_scalar_sub( - const Device& device, const T* scalar_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) + const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_scalar_vector_op(VecOperation::SCALAR_SUB_VEC, scalar_a, vec_b, n, config, output); + return cpu_scalar_vector_op(VecOperation::SCALAR_SUB_VEC, scalar_a, vec_b, size, config, output); } REGISTER_SCALAR_SUB_VEC_BACKEND("CPU", cpu_scalar_sub); -/*********************************** CONVERT MONTGOMERY ***********************************/ +/*********************************** MUL BY SCALAR***********************************/ template -eIcicleError cpu_convert_montgomery( - const Device& device, const T* input, uint64_t n, bool is_into, const VecOpsConfig& config, T* output) +eIcicleError cpu_scalar_mul( + const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < n; i += NOF_OPERATIONS_PER_TASK) { - VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_1op_task( - is_into ? 
CONVERT_TO_MONTGOMERY : CONVERT_FROM_MONTGOMERY, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - i), - input + i, output + i); - } - task_manager.wait_done(); - return eIcicleError::SUCCESS; + return cpu_scalar_vector_op(VecOperation::SCALAR_MUL_VEC, scalar_a, vec_b, size, config, output); } -REGISTER_CONVERT_MONTGOMERY_BACKEND("CPU", cpu_convert_montgomery); - -#ifdef EXT_FIELD -REGISTER_VECTOR_ADD_EXT_FIELD_BACKEND("CPU", cpu_vector_add); -REGISTER_VECTOR_ACCUMULATE_EXT_FIELD_BACKEND("CPU", cpu_vector_accumulate); -REGISTER_VECTOR_SUB_EXT_FIELD_BACKEND("CPU", cpu_vector_sub); -REGISTER_VECTOR_MUL_EXT_FIELD_BACKEND("CPU", cpu_vector_mul); -REGISTER_CONVERT_MONTGOMERY_EXT_FIELD_BACKEND("CPU", cpu_convert_montgomery); -#endif // EXT_FIELD +REGISTER_SCALAR_MUL_VEC_BACKEND("CPU", cpu_scalar_mul); /*********************************** TRANSPOSE ***********************************/ + template -eIcicleError cpu_matrix_transpose( +eIcicleError out_of_place_matrix_transpose( const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) { - // Check for invalid arguments - if (!mat_in || !mat_out || nof_rows == 0 || nof_cols == 0) { return eIcicleError::INVALID_ARGUMENT; } + TasksManager> task_manager(get_nof_workers(config) - 1); + uint32_t stride = config.columns_batch ? config.batch_size : 1; + const uint64_t total_elements_one_mat = static_cast(nof_rows) * nof_cols; + const uint32_t NOF_ROWS_PER_TASK = + std::min((uint64_t)nof_rows, std::max((uint64_t)(NOF_OPERATIONS_PER_TASK / nof_cols), (uint64_t)1)); + for (uint32_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + const T* cur_mat_in = config.columns_batch ? mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements_one_mat; + T* cur_mat_out = config.columns_batch ? 
mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements_one_mat; + // Perform the matrix transpose + for (uint32_t i = 0; i < nof_rows; i += NOF_ROWS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_out_of_place_matrix_transpose_task( + OUT_OF_PLACE_MATRIX_TRANSPOSE, cur_mat_in + stride * i * nof_cols, + std::min((uint64_t)NOF_ROWS_PER_TASK, (uint64_t)nof_rows - i), nof_rows, nof_cols, stride, + cur_mat_out + (stride * i)); + } + } + task_manager.wait_done(); + return eIcicleError::SUCCESS; +} - // Perform the matrix transpose - for (uint32_t i = 0; i < nof_rows; ++i) { - for (uint32_t j = 0; j < nof_cols; ++j) { - mat_out[j * nof_rows + i] = mat_in[i * nof_cols + j]; +uint32_t gcd(uint32_t a, uint32_t b) +{ + while (b != 0) { + uint32_t temp = b; + b = a % b; + a = temp; + } + return a; +} + +// Recursive function to generate all k-ary necklaces and to replace the elements within the necklaces +template +void gen_necklace( + uint32_t t, + uint32_t p, + uint32_t k, + uint32_t length, + std::vector& necklace, + std::vector& task_indices) +{ + if (t > length) { + if ( + length % p == 0 && + !std::all_of(necklace.begin() + 1, necklace.begin() + length + 1, [first_element = necklace[1]](uint32_t x) { + return x == first_element; + })) { + uint32_t start_idx = 0; + uint64_t multiplier = 1; + for (int i = length; i >= 1; --i) { // Compute start_idx as the decimal representation of the necklace + start_idx += necklace[i] * multiplier; + multiplier *= k; + } + task_indices.push_back(start_idx); } + return; } + necklace[t] = necklace[t - p]; + gen_necklace(t + 1, p, k, length, necklace, task_indices); + + for (int i = necklace[t - p] + 1; i < k; ++i) { + necklace[t] = i; + gen_necklace(t + 1, t, k, length, necklace, task_indices); + } +} + +template +eIcicleError matrix_transpose_necklaces( + const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) +{ + uint32_t log_nof_rows = static_cast(std::floor(std::log2(nof_rows))); + uint32_t log_nof_cols = static_cast(std::floor(std::log2(nof_cols))); + uint32_t gcd_value = gcd(log_nof_rows, log_nof_cols); + uint32_t k = 1 << gcd_value; // Base of necklaces + uint32_t length = + (log_nof_cols + log_nof_rows) / gcd_value; // length of necklaces. Since all are powers of 2, equivalent to + // (log_nof_cols + log_nof_rows) / gcd_value; + const uint64_t max_nof_operations = NOF_OPERATIONS_PER_TASK / length; + const uint64_t total_elements_one_mat = static_cast(nof_rows) * nof_cols; + + std::vector necklace(length + 1, 0); + std::vector start_indices_in_mat; // Collect start indices + gen_necklace(1, 1, k, length, necklace, start_indices_in_mat); + + TasksManager> task_manager(get_nof_workers(config) - 1); + for (uint64_t i = 0; i < start_indices_in_mat.size(); i += max_nof_operations) { + uint64_t nof_operations = std::min((uint64_t)max_nof_operations, start_indices_in_mat.size() - i); + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_replace_elements_task( + REPLACE_ELEMENTS, config.columns_batch ? mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements_one_mat, + nof_operations, start_indices_in_mat, i, log_nof_rows, log_nof_cols, + config.columns_batch ? config.batch_size : 1, + config.columns_batch ? 
mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements_one_mat); + } + } + task_manager.wait_done(); return eIcicleError::SUCCESS; } +template +eIcicleError cpu_matrix_transpose( + const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) +{ + ICICLE_ASSERT(mat_in && mat_out && nof_rows != 0 && nof_cols != 0) << "Invalid argument"; + + // check if the number of rows and columns are powers of 2, if not use the basic transpose + bool is_power_of_2 = (nof_rows & (nof_rows - 1)) == 0 && (nof_cols & (nof_cols - 1)) == 0; + bool is_inplace = mat_in == mat_out; + if (!is_inplace) { + return (out_of_place_matrix_transpose(device, mat_in, nof_rows, nof_cols, config, mat_out)); + } else if (is_power_of_2) { + return (matrix_transpose_necklaces(mat_in, nof_rows, nof_cols, config, mat_out)); + } else { + ICICLE_LOG_ERROR << "Matrix transpose is not supported for inplace non power of 2 rows and columns"; + return eIcicleError::INVALID_ARGUMENT; + } +} + REGISTER_MATRIX_TRANSPOSE_BACKEND("CPU", cpu_matrix_transpose); #ifdef EXT_FIELD REGISTER_MATRIX_TRANSPOSE_EXT_FIELD_BACKEND("CPU", cpu_matrix_transpose); @@ -474,21 +737,23 @@ template eIcicleError cpu_bit_reverse(const Device& device, const T* vec_in, uint64_t size, const VecOpsConfig& config, T* vec_out) { - // Check for invalid arguments - if (!vec_in || !vec_out || size == 0) { return eIcicleError::INVALID_ARGUMENT; } + ICICLE_ASSERT(vec_in && vec_out && size != 0) << "Invalid argument"; - // Calculate log2(size) - int logn = static_cast(std::floor(std::log2(size))); - if ((1ULL << logn) != size) { - return eIcicleError::INVALID_ARGUMENT; // Ensure size is a power of 2 - } + uint32_t logn = static_cast(std::floor(std::log2(size))); + ICICLE_ASSERT((1ULL << logn) == size) << "Invalid argument - size is not a power of 2"; // Perform the bit reverse - TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < size; i += NOF_OPERATIONS_PER_TASK) { - VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_bitrev_task( - BIT_REVERSE, logn, i, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - i), vec_in, vec_out); + TasksManager> task_manager(get_nof_workers(config) - 1); + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + for (uint64_t i = 0; i < size; i += NOF_OPERATIONS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + + task_p->send_bit_reverse_task( + BIT_REVERSE, logn, i, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - i), + config.columns_batch ? vec_in + idx_in_batch : vec_in + idx_in_batch * size, + config.columns_batch ? config.batch_size : 1, + config.columns_batch ? 
vec_out + idx_in_batch : vec_out + idx_in_batch * size); + } } task_manager.wait_done(); return eIcicleError::SUCCESS; @@ -507,20 +772,25 @@ eIcicleError cpu_slice( const T* vec_in, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig& config, T* vec_out) { - if (vec_in == nullptr || vec_out == nullptr) { - ICICLE_LOG_ERROR << "Error: Invalid argument - input or output vector is null"; - return eIcicleError::INVALID_ARGUMENT; - } - - TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < size; i += NOF_OPERATIONS_PER_TASK) { - VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_slice_task( - SLICE, stride, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - i), vec_in + offset + i * stride, vec_out + i); + ICICLE_ASSERT(vec_in != nullptr && vec_out != nullptr) << "Error: Invalid argument - input or output vector is null"; + ICICLE_ASSERT(offset + (size_out - 1) * stride < size_in) << "Error: Invalid argument - slice out of bound"; + + TasksManager> task_manager(get_nof_workers(config) - 1); + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + for (uint64_t i = 0; i < size_out; i += NOF_OPERATIONS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_slice_task( + SLICE, config.columns_batch ? stride * config.batch_size : stride, config.columns_batch ? config.batch_size : 1, + std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size_out - i), + config.columns_batch ? vec_in + idx_in_batch + (offset + i * stride) * config.batch_size + : vec_in + idx_in_batch * size_in + offset + i * stride, + config.columns_batch ? vec_out + idx_in_batch + i * config.batch_size : vec_out + idx_in_batch * size_out + i); + } } task_manager.wait_done(); return eIcicleError::SUCCESS; @@ -531,6 +801,29 @@ REGISTER_SLICE_BACKEND("CPU", cpu_slice); REGISTER_SLICE_EXT_FIELD_BACKEND("CPU", cpu_slice); #endif // EXT_FIELD +/*********************************** Highest non-zero idx ***********************************/ +template +eIcicleError cpu_highest_non_zero_idx( + const Device& device, const T* input, uint64_t size, const VecOpsConfig& config, int64_t* out_idx /*OUT*/) +{ + ICICLE_ASSERT(input && out_idx && size != 0) << "Error: Invalid argument"; + uint64_t stride = config.columns_batch ? config.batch_size : 1; + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) { + out_idx[idx_in_batch] = -1; // zero vector is considered '-1' since 0 would be zero in vec[0] + const T* curr_input = + config.columns_batch ? 
input + idx_in_batch : input + idx_in_batch * size; // Pointer to the current vector + for (int64_t i = size - 1; i >= 0; --i) { + if (curr_input[i * stride] != T::zero()) { + out_idx[idx_in_batch] = i; + break; + } + } + } + return eIcicleError::SUCCESS; +} + +REGISTER_HIGHEST_NON_ZERO_IDX_BACKEND("CPU", cpu_highest_non_zero_idx); + /*********************************** Polynomial evaluation ***********************************/ template @@ -543,12 +836,19 @@ eIcicleError cpu_poly_eval( const VecOpsConfig& config, T* evals /*OUT*/) { + ICICLE_ASSERT(coeffs && domain && evals && coeffs_size != 0 && domain_size != 0) << "Error: Invalid argument"; // using Horner's method // example: ax^2+bx+c is computed as (1) r=a, (2) r=r*x+b, (3) r=r*x+c - for (uint64_t eval_idx = 0; eval_idx < domain_size; ++eval_idx) { - evals[eval_idx] = coeffs[coeffs_size - 1]; - for (int64_t coeff_idx = coeffs_size - 2; coeff_idx >= 0; --coeff_idx) { - evals[eval_idx] = evals[eval_idx] * domain[eval_idx] + coeffs[coeff_idx]; + uint64_t stride = config.columns_batch ? config.batch_size : 1; + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) { + const T* curr_coeffs = config.columns_batch ? coeffs + idx_in_batch : coeffs + idx_in_batch * coeffs_size; + T* curr_evals = config.columns_batch ? evals + idx_in_batch : evals + idx_in_batch * domain_size; + for (uint64_t eval_idx = 0; eval_idx < domain_size; ++eval_idx) { + curr_evals[eval_idx * stride] = curr_coeffs[(coeffs_size - 1) * stride]; + for (int64_t coeff_idx = coeffs_size - 2; coeff_idx >= 0; --coeff_idx) { + curr_evals[eval_idx * stride] = + curr_evals[eval_idx * stride] * domain[eval_idx] + curr_coeffs[coeff_idx * stride]; + } } } return eIcicleError::SUCCESS; @@ -556,38 +856,21 @@ eIcicleError cpu_poly_eval( REGISTER_POLYNOMIAL_EVAL("CPU", cpu_poly_eval); -/*********************************** Highest non-zero idx ***********************************/ -template -eIcicleError cpu_highest_non_zero_idx( - const Device& device, const T* input, uint64_t size, const VecOpsConfig& config, int64_t* out_idx /*OUT*/) -{ - *out_idx = -1; // zero vector is considered '-1' since 0 would be zero in vec[0] - for (int64_t i = size - 1; i >= 0; --i) { - if (input[i] != T::zero()) { - *out_idx = i; - break; - } - } - return eIcicleError::SUCCESS; -} - -REGISTER_HIGHEST_NON_ZERO_IDX_BACKEND("CPU", cpu_highest_non_zero_idx); - /*============================== polynomial division ==============================*/ template -void school_book_division_step_cpu(T* r, T* q, const T* b, int deg_r, int deg_b, const T& lc_b_inv) +void school_book_division_step_cpu(T* r, T* q, const T* b, int deg_r, int deg_b, const T& lc_b_inv, uint32_t stride = 1) { int64_t monomial = deg_r - deg_b; // monomial=1 is 'x', monomial=2 is x^2 etc. 
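+  // Note: `stride` is the distance between consecutive coefficients of the same polynomial:
+  // 1 for a single polynomial, and the batch size when coefficients are column-strided
+  // (columns_batch), which is how the batched division routine below calls this helper.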
- T lc_r = r[deg_r]; + T lc_r = r[deg_r * stride]; // leading coefficient of r T monomial_coeff = lc_r * lc_b_inv; // lc_r / lc_b // adding monomial s to q (q=q+s) - q[monomial] = monomial_coeff; + q[monomial * stride] = monomial_coeff; for (int i = monomial; i <= deg_r; ++i) { - T b_coeff = b[i - monomial]; - r[i] = r[i] - monomial_coeff * b_coeff; + T b_coeff = b[(i - monomial) * stride]; + r[i * stride] = r[i * stride] - monomial_coeff * b_coeff; } } @@ -595,36 +878,65 @@ template eIcicleError cpu_poly_divide( const Device& device, const T* numerator, - int64_t numerator_deg, - const T* denumerator, - int64_t denumerator_deg, + uint64_t numerator_size, + const T* denominator, + uint64_t denominator_size, const VecOpsConfig& config, T* q_out /*OUT*/, uint64_t q_size, T* r_out /*OUT*/, uint64_t r_size) { - ICICLE_ASSERT(r_size >= numerator_deg) - << "polynomial division expects r(x) size to be similar to numerator size and higher than numerator degree(x)"; - ICICLE_ASSERT(q_size >= (numerator_deg - denumerator_deg + 1)) - << "polynomial division expects q(x) size to be at least deg(numerator)-deg(denumerator)+1"; - - ICICLE_CHECK(icicle_copy_async(r_out, numerator, r_size * sizeof(T), config.stream)); - - // invert largest coeff of b - const T& lc_b_inv = T::inverse(denumerator[denumerator_deg]); - - int64_t deg_r = numerator_deg; - while (deg_r >= denumerator_deg) { - // each iteration is removing the largest monomial in r until deg(r)= numerator_deg + 1) + << "polynomial division expects r(x) size to be similar to numerator size and higher than numerator degree(x)"; + ICICLE_ASSERT(q_size >= (numerator_deg - denominator_deg + 1)) + << "polynomial division expects q(x) size to be at least deg(numerator)-deg(denominator)+1"; + + memset(curr_r_out, 0, sizeof(T) * r_size); + memcpy(curr_r_out, curr_numerator, sizeof(T) * (numerator_deg + 1)); + + // invert largest coeff of b + const T& lc_b_inv = T::inverse(curr_denominator[denominator_deg * stride]); + int64_t deg_r = numerator_deg; + while (deg_r >= denominator_deg) { + // each iteration is removing the largest monomial in r until deg(r)); \ No newline at end of file +REGISTER_POLYNOMIAL_DIVISION("CPU", cpu_poly_divide); + +#ifdef EXT_FIELD +REGISTER_VECTOR_ADD_EXT_FIELD_BACKEND("CPU", cpu_vector_add); +REGISTER_VECTOR_ACCUMULATE_EXT_FIELD_BACKEND("CPU", cpu_vector_accumulate); +REGISTER_VECTOR_SUB_EXT_FIELD_BACKEND("CPU", cpu_vector_sub); +REGISTER_VECTOR_MUL_EXT_FIELD_BACKEND("CPU", cpu_vector_mul); +REGISTER_VECTOR_DIV_EXT_FIELD_BACKEND("CPU", cpu_vector_div); +REGISTER_CONVERT_MONTGOMERY_EXT_FIELD_BACKEND("CPU", cpu_convert_montgomery); +REGISTER_VECTOR_SUM_EXT_FIELD_BACKEND("CPU", cpu_vector_sum); +REGISTER_VECTOR_PRODUCT_EXT_FIELD_BACKEND("CPU", cpu_vector_product); +REGISTER_SCALAR_MUL_VEC_EXT_FIELD_BACKEND("CPU", cpu_scalar_mul); +REGISTER_SCALAR_ADD_VEC_EXT_FIELD_BACKEND("CPU", cpu_scalar_add); +REGISTER_SCALAR_SUB_VEC_EXT_FIELD_BACKEND("CPU", cpu_scalar_sub); +#endif // EXT_FIELD \ No newline at end of file diff --git a/icicle/include/icicle/backend/vec_ops_backend.h b/icicle/include/icicle/backend/vec_ops_backend.h index 8ee0c0a15..3739fb780 100644 --- a/icicle/include/icicle/backend/vec_ops_backend.h +++ b/icicle/include/icicle/backend/vec_ops_backend.h @@ -7,16 +7,72 @@ using namespace field_config; namespace icicle { /*************************** Backend registration ***************************/ + using vectorVectorOpImplInplaceA = std::function; + + using scalarConvertMontgomeryImpl = std::function; + + using 
VectorReduceOpImpl = std::function; + using scalarVectorOpImpl = std::function; - using scalarVectorOpImplInplaceA = std::function; + using scalarMatrixOpImpl = std::function; + + using scalarBitReverseOpImpl = std::function; + + using scalarSliceOpImpl = std::function; + + using scalarHighNonZeroIdxOpImpl = std::function; + + using scalarPolyEvalImpl = std::function; + + using scalarPolyDivImpl = std::function; void register_vector_add(const std::string& deviceType, scalarVectorOpImpl impl); @@ -28,7 +84,7 @@ namespace icicle { }(); \ } - void register_vector_accumulate(const std::string& deviceType, scalarVectorOpImplInplaceA impl); + void register_vector_accumulate(const std::string& deviceType, vectorVectorOpImplInplaceA impl); #define REGISTER_VECTOR_ACCUMULATE_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ @@ -67,6 +123,36 @@ namespace icicle { }(); \ } + void register_scalar_convert_montgomery(const std::string& deviceType, scalarConvertMontgomeryImpl); + +#define REGISTER_CONVERT_MONTGOMERY_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_scalar_convert_mont) = []() -> bool { \ + register_scalar_convert_montgomery(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_vector_sum(const std::string& deviceType, VectorReduceOpImpl impl); + +#define REGISTER_VECTOR_SUM_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_sum) = []() -> bool { \ + register_vector_sum(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_vector_product(const std::string& deviceType, VectorReduceOpImpl impl); + +#define REGISTER_VECTOR_PRODUCT_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_product) = []() -> bool { \ + register_vector_product(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + void register_scalar_mul_vec(const std::string& deviceType, scalarVectorOpImpl impl); #define REGISTER_SCALAR_MUL_VEC_BACKEND(DEVICE_TYPE, FUNC) \ @@ -97,32 +183,6 @@ namespace icicle { }(); \ } - using scalarConvertMontgomeryImpl = std::function; - - void register_scalar_convert_montgomery(const std::string& deviceType, scalarConvertMontgomeryImpl); - -#define REGISTER_CONVERT_MONTGOMERY_BACKEND(DEVICE_TYPE, FUNC) \ - namespace { \ - static bool UNIQUE(_reg_scalar_convert_mont) = []() -> bool { \ - register_scalar_convert_montgomery(DEVICE_TYPE, FUNC); \ - return true; \ - }(); \ - } - - using scalarMatrixOpImpl = std::function; - void register_matrix_transpose(const std::string& deviceType, scalarMatrixOpImpl impl); #define REGISTER_MATRIX_TRANSPOSE_BACKEND(DEVICE_TYPE, FUNC) \ @@ -133,9 +193,6 @@ namespace icicle { }(); \ } - using scalarBitReverseOpImpl = std::function; - void register_scalar_bit_reverse(const std::string& deviceType, scalarBitReverseOpImpl); #define REGISTER_BIT_REVERSE_BACKEND(DEVICE_TYPE, FUNC) \ @@ -146,15 +203,6 @@ namespace icicle { }(); \ } - using scalarSliceOpImpl = std::function; - void register_slice(const std::string& deviceType, scalarSliceOpImpl); #define REGISTER_SLICE_BACKEND(DEVICE_TYPE, FUNC) \ @@ -165,9 +213,6 @@ namespace icicle { }(); \ } - using scalarHighNonZeroIdxOpImpl = std::function; - void register_highest_non_zero_idx(const std::string& deviceType, scalarHighNonZeroIdxOpImpl); #define REGISTER_HIGHEST_NON_ZERO_IDX_BACKEND(DEVICE_TYPE, FUNC) \ @@ -178,24 +223,6 @@ namespace icicle { }(); \ } - template - eIcicleError polynomial_eval( - const T* coeffs, - uint64_t coeffs_size, - const T* domain, - uint64_t domain_size, - const VecOpsConfig& config, - T* evals /*OUT*/); 
- - using scalarPolyEvalImpl = std::function; - void register_poly_eval(const std::string& deviceType, scalarPolyEvalImpl); #define REGISTER_POLYNOMIAL_EVAL(DEVICE_TYPE, FUNC) \ @@ -206,18 +233,6 @@ namespace icicle { }(); \ } - using scalarPolyDivImpl = std::function; - void register_poly_division(const std::string& deviceType, scalarPolyDivImpl); #define REGISTER_POLYNOMIAL_DIVISION(DEVICE_TYPE, FUNC) \ @@ -233,12 +248,23 @@ namespace icicle { const Device& device, const extension_t* vec_a, const extension_t* vec_b, - uint64_t n, + uint64_t size, const VecOpsConfig& config, extension_t* output)>; using extFieldVectorOpImplInplaceA = std::function; + const Device& device, extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config)>; + + using extFieldVectorReduceOpImpl = std::function; + + using extFieldVectorOpImpl = std::function; void register_extension_vector_add(const std::string& deviceType, extFieldVectorOpImpl impl); @@ -279,11 +305,71 @@ namespace icicle { }(); \ } + void register_extension_vector_div(const std::string& deviceType, extFieldVectorOpImpl impl); + + #define REGISTER_VECTOR_DIV_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_div_ext_field) = []() -> bool { \ + register_extension_vector_div(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_extension_scalar_mul_vec(const std::string& deviceType, extFieldVectorOpImpl impl); + + #define REGISTER_SCALAR_MUL_VEC_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_scalar_mul_vec_ext_field) = []() -> bool { \ + register_extension_scalar_mul_vec(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_extension_scalar_add_vec(const std::string& deviceType, extFieldVectorOpImpl impl); + + #define REGISTER_SCALAR_ADD_VEC_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_scalar_add_vec_ext_field) = []() -> bool { \ + register_extension_scalar_add_vec(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_extension_scalar_sub_vec(const std::string& deviceType, extFieldVectorOpImpl impl); + + #define REGISTER_SCALAR_SUB_VEC_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_scalar_sub_vec_ext_field) = []() -> bool { \ + register_extension_scalar_sub_vec(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_extension_vector_sum(const std::string& deviceType, extFieldVectorReduceOpImpl impl); + + #define REGISTER_VECTOR_SUM_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_sum_ext_field) = []() -> bool { \ + register_extension_vector_sum(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_extension_vector_product(const std::string& deviceType, extFieldVectorReduceOpImpl impl); + + #define REGISTER_VECTOR_PRODUCT_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_product_ext_field) = []() -> bool { \ + register_extension_vector_product(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + using extFieldConvertMontgomeryImpl = std::function; @@ -333,7 +419,8 @@ namespace icicle { const extension_t* input, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig& config, extension_t* output)>; diff --git a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h index f0643f978..ef59f816f 
100644 --- a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h +++ b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h @@ -65,7 +65,7 @@ namespace icicle { config.is_async = true; config.stream = m_stream; - ICICLE_CHECK(icicle::slice(in_coeffs, offset, stride, out_size, config, out_coeffs)); + ICICLE_CHECK(icicle::slice(in_coeffs, offset, stride, in_size, out_size, config, out_coeffs)); } void add_sub(PolyContext& res, PolyContext a, PolyContext b, bool add1_sub0) @@ -278,7 +278,7 @@ namespace icicle { config.is_result_on_device = true; ICICLE_CHECK(icicle::polynomial_division( - a_coeffs, deg_a, b_coeffs, deg_b, config, Q_coeffs, deg_a - deg_b + 1, R_coeffs, a_N)); + a_coeffs, deg_a + 1, b_coeffs, deg_b + 1, config, Q_coeffs, deg_a - deg_b + 1, R_coeffs, a_N)); } void quotient(PolyContext Q, PolyContext op_a, PolyContext op_b) override @@ -546,8 +546,8 @@ namespace icicle { config.is_result_on_device = true; config.is_async = true; config.stream = m_stream; - ICICLE_CHECK( - icicle::slice(get_context_storage_immutable(p), 0 /*offset*/, stride, domain_size, config, d_evals)); + ICICLE_CHECK(icicle::slice( + get_context_storage_immutable(p), 0 /*offset*/, stride, poly_size, domain_size, config, d_evals)); } else { ICICLE_CHECK(icicle_memset(d_evals, 0, domain_size * sizeof(I))); auto ntt_config = default_ntt_config(); diff --git a/icicle/include/icicle/utils/modifiers.h b/icicle/include/icicle/utils/modifiers.h index a8728d279..b652e9829 100644 --- a/icicle/include/icicle/utils/modifiers.h +++ b/icicle/include/icicle/utils/modifiers.h @@ -33,4 +33,4 @@ #else #define LONG_CONST_SUFFIX(x) x##L #define PACKED(x) x __attribute__((packed)) -#endif \ No newline at end of file +#endif diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index b23cd0a4b..38551ab6a 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -17,17 +17,22 @@ namespace icicle { * @note APIs with a single input, ignore input b. */ struct VecOpsConfig { - icicleStreamHandle stream; /**< Stream for asynchronous execution. */ - bool is_a_on_device; /**< True if `a` is on the device, false if it is not. Default value: false. */ - bool is_b_on_device; /**< True if `b` is on the device, false if it is not. Default value: false. OPTIONAL. */ - bool is_result_on_device; /**< If true, the output is preserved on the device, otherwise on the host. Default value: - false. */ - bool is_async; /**< Whether to run the vector operations asynchronously. - If set to `true`, the function will be non-blocking and synchronization - must be explicitly managed using `cudaStreamSynchronize` or - `cudaDeviceSynchronize`. If set to `false`, the function will block the current CPU - thread. */ - ConfigExtension* ext = nullptr; /**< Backend-specific extension. */ + icicleStreamHandle stream; /** Stream for asynchronous execution. */ + bool is_a_on_device; /** True if `a` is on the device, false if it is not. Default value: false. */ + bool is_b_on_device; /** True if `b` is on the device, false if it is not. Default value: false. OPTIONAL. */ + bool is_result_on_device; /** If true, the output is preserved on the device, otherwise on the host. Default value: + false. */ + bool is_async; /** Whether to run the vector operations asynchronously. + If set to `true`, the function will be non-blocking and synchronization + must be explicitly managed using `cudaStreamSynchronize` or `cudaDeviceSynchronize`. 
+ If set to `false`, the function will block the current CPU thread. */ + int batch_size; /** Number of vectors (or operations) to process in a batch. + Each vector operation will be performed independently on each batch element. + Default value: 1. */ + bool columns_batch; /** True if the batched vectors are stored as columns in a 2D array (i.e., the vectors are + strided in memory as columns of a matrix). If false, the batched vectors are stored + contiguously in memory (e.g., as rows or in a flat array). Default value: false. */ + ConfigExtension* ext = nullptr; /** Backend-specific extension. */ }; /** @@ -43,6 +48,8 @@ namespace icicle { false, // is_b_on_device false, // is_result_on_device false, // is_async + 1, // batch_size + false, // columns_batch }; return config; } @@ -53,11 +60,17 @@ namespace icicle { * @brief Adds two vectors element-wise. * * @tparam T Type of the elements in the vectors. - * @param vec_a Input vector `a`. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). + * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. */ template @@ -67,24 +80,36 @@ namespace icicle { * @brief Accumulates the elements of two vectors element-wise and stores the result in the first vector. * * @tparam T Type of the elements in the vectors. - * @param vec_a Input/output vector `a`. The result will be written back to this vector. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first Input/output vector(s). The result will be written back to this vector. + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). + * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. * @return eIcicleError Error code indicating success or failure. */ template - eIcicleError vector_accumulate(T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config); + eIcicleError + vector_accumulate(T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config); // use vector_add (inplace) /** * @brief Subtracts vector `b` from vector `a` element-wise. * * @tparam T Type of the elements in the vectors. - * @param vec_a Input vector `a`. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. 
+ * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). + * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. */ template @@ -94,11 +119,17 @@ namespace icicle { * @brief Multiplies two vectors element-wise. * * @tparam T Type of the elements in the vectors. - * @param vec_a Input vector `a`. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). + * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. */ template @@ -108,11 +139,17 @@ namespace icicle { * @brief Divides vector `a` by vector `b` element-wise. * * @tparam T Type of the elements in the vectors. - * @param vec_a Input vector `a`. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). + * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. */ template @@ -122,15 +159,59 @@ namespace icicle { * @brief Converts elements to and from Montgomery form. * * @tparam T Type of the elements. - * @param input Input vector. - * @param size Number of elements in the input vector. - * @param is_into True to convert into Montgomery form, false to convert out of Montgomery form. + * @param input Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in each vector. 
+ * @param is_to_montgomery True to convert into Montgomery form, false to convert out of Montgomery form. + * @param config Configuration for the operation. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. + * @return eIcicleError Error code indicating success or failure. + */ + template + eIcicleError + convert_montgomery(const T* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, T* output); + + // Reduction operations + + /** + * @brief Computes the sum of all elements in each vector in a batch. + * + * @tparam T Type of the elements in the vector. + * @param vec_a Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in each vector. + * @param config Configuration for the operation. + * @param output Pointer to the output array where the results will be stored. + * @return eIcicleError Error code indicating success or failure. + */ + + template + eIcicleError vector_sum(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); + + /** + * @brief Computes the product of all elements in each vector in the batch. + * + * @tparam T Type of the elements in the vectors. + * @param vec_a Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output array where the results will be stored. * @return eIcicleError Error code indicating success or failure. */ + template - eIcicleError convert_montgomery(const T* input, uint64_t size, bool is_into, const VecOpsConfig& config, T* output); + eIcicleError vector_product(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); // Scalar-Vector operations @@ -138,12 +219,17 @@ namespace icicle { * @brief Adds a scalar to each element of a vector. * * @tparam T Type of the elements in the vector and the scalar. - * @param scalar_a Input scalar. - * @param vec_b Input vector. - * @param size Number of elements in the vector. + * @param scalar_a Pointer to the input scalar(s). + * @param vec_b Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in a vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. + * @note To subtract a scalar from each element of a vector - use scalar_add_vec with negative scalar. 
*/ template eIcicleError scalar_add_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); @@ -152,12 +238,17 @@ namespace icicle { * @brief Subtracts each element of a vector from a scalar, elementwise (res[i]=scalar-vec[i]). * * @tparam T Type of the elements in the vector and the scalar. - * @param scalar_a Input scalar. - * @param vec_b Input vector. - * @param size Number of elements in the vector. + * @param scalar_a Pointer to Input scalar(s). + * @param vec_b Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in a vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. + * @note To subtract a scalar from each element of a vector - use scalar_add_vec with negative scalar. */ template eIcicleError scalar_sub_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); @@ -166,11 +257,15 @@ namespace icicle { * @brief Multiplies each element of a vector by a scalar. * * @tparam T Type of the elements in the vector and the scalar. - * @param scalar_a Input scalar. - * @param vec_b Input vector. - * @param size Number of elements in the vector. + * @param scalar_a Pointer to Input scalar(s). + * @param vec_b Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in a vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. */ template @@ -182,12 +277,15 @@ namespace icicle { * @brief Transposes a matrix. * * @tparam T Type of the elements in the matrix. - * @param mat_in Input matrix. - * @param nof_rows Number of rows in the input matrix. - * @param nof_cols Number of columns in the input matrix. + * @param mat_in Pointer to the input matrix or matrices. + * @param nof_rows Number of rows in each input matrix. + * @param nof_cols Number of columns in each input matrix. * @param config Configuration for the operation. - * @param mat_out Output matrix to store the result. + * @param mat_out Pointer to the output matrix or matrices where the transposed matrices will be stored. * @return eIcicleError Error code indicating success or failure. + * @note The input matrices are assumed to be stored in row-major order. + * This function transposes an input matrix or a batch of matrices. + * Matrix transpose inplace is not supported for non-power of 2 rows and columns. */ template eIcicleError @@ -196,42 +294,65 @@ namespace icicle { // Miscellaneous operations /** - * @brief Reorders the vector elements based on bit-reverse. That is out[i]=in[bitrev[i]]. + * @brief Reorders the vector (or batch of vectors) elements based on bit-reverse. That is out[i]=in[bitrev[i]]. * * @tparam T Type of the elements in the vector. 
- * @param vec_in Input vector. - * @param size Number of elements in the input vector. + * @param vec_in Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in each vector. Must be a power of 2. * @param config Configuration for the operation. - * @param vec_out Output vector to store the result. + * @param vec_out Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. + * @note If `vec_in` and `vec_out` point to the same memory location, the operation is performed in-place. */ template eIcicleError bit_reverse(const T* vec_in, uint64_t size, const VecOpsConfig& config, T* vec_out); /** - * @brief Extracts a slice from a vector. + * @brief Extracts a slice from a vector or batch of vectors. * * @tparam T Type of the elements in the vector. - * @param vec_in Input vector. - * @param offset Offset from which to start the slice. + * @param vec_in Pointer to the input vector(s). + * @param offset Offset from which to start the slice in each vector. * @param stride Stride between elements in the slice. - * @param size Number of elements in the slice. + * @param size_in Number of elements in one input vector. + * @param size_out Number of elements in one output vector. * @param config Configuration for the operation. - * @param vec_out Output vector to store the result. + * @param vec_out Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. + * @note The total input size is `size_in * config.batch_size`. + * The total output size is `size_out * config.batch_size`. + * The parameters must satisfy: offset + (size_out - 1) * stride < size_in */ template + eIcicleError slice( + const T* vec_in, + uint64_t offset, + uint64_t stride, + uint64_t size_in, + uint64_t size_out, + const VecOpsConfig& config, + T* vec_out); + + // Deprecated slice API + template eIcicleError - slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size, const VecOpsConfig& config, T* vec_out); + slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size_out, const VecOpsConfig& config, T* vec_out); /** - * @brief Finds the highest non-zero index in a vector. + * @brief Finds the highest non-zero index in a vector or batch of vectors. * * @tparam T Type of the elements in the vector. - * @param vec_in Input vector. - * @param size Number of elements in the input vector. + * @param vec_in Pointer to the input vector(s). + * @param size Number of elements in each input vector. * @param config Configuration for the operation. - * @param out_idx Output index of the highest non-zero element. + * @param out_idx Pointer to an array where the output indices of the highest non-zero element in each input vector + * will be stored. The array should have a length of `config.batch_size`. * @return eIcicleError Error code indicating success or failure. */ template @@ -241,12 +362,21 @@ namespace icicle { * @brief Evaluates a polynomial at given domain points. * * @tparam T Type of the elements in the polynomial and domain.
- * @param coeffs Pointer to the array of coefficients of the polynomial. - * @param coeffs_size Number of coefficients in the polynomial. - * @param domain Pointer to the array of points at which to evaluate the polynomial. + * @param coeffs Pointer to the array of coefficients of the polynomial(s). + * - The size of `coeffs` should be `coeffs_size * batch_size`. + * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored + * contiguously. + * - If `config.columns_batch` is `true`, coefficients are interleaved. + * @param coeffs_size Number of coefficients in each polynomial. + * @param domain Pointer to the array of points at which to evaluate the polynomial(s). + * - The same domain is used for all polynomials. + * - The size of `domain` should be `domain_size`. * @param domain_size Number of domain points. * @param config Configuration for the operation. * @param evals Pointer to the array where the evaluated results will be stored. This is an output parameter. + * - The size of `evals` should be `domain_size * batch_size`. + * - If `config.columns_batch` is `false`, results for each polynomial are stored contiguously. + * - If `config.columns_batch` is `true`, results are interleaved. * @return eIcicleError Error code indicating success or failure. */ template @@ -259,26 +389,39 @@ namespace icicle { T* evals /*OUT*/); /** - * @brief Divides two polynomials. + * @brief Divides two polynomials, or batches of polynomial pairs. * * @tparam T Type of the elements in the polynomials. - * @param numerator Pointer to the array of coefficients of the numerator polynomial. - * @param numerator_deg Degree of the numerator polynomial. - * @param denominator Pointer to the array of coefficients of the denominator polynomial. - * @param denominator_deg Degree of the denominator polynomial. + * @param numerator Pointer to the array of coefficients of the numerator polynomial(s). + * - The size of `numerator` should be `(numerator_deg + 1) * batch_size`. + * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored + * contiguously. + * - If `config.columns_batch` is `true`, coefficients are interleaved. + * @param numerator_size Size (number of T elements) of the numerator vector for a single batch element. + * @param denominator Pointer to the array of coefficients of the denominator polynomial(s). + * - Storage layout is similar to `numerator`. + * @param denominator_size Size (number of T elements) of the denominator vector for a single batch element. * @param config Configuration for the operation. - * @param q_out Pointer to the array where the quotient will be stored. This is an output parameter. - * @param q_size Size of the quotient array. - * @param r_out Pointer to the array where the remainder will be stored. This is an output parameter. + * @param q_out Pointer to the array where the quotient polynomial(s) will be stored. This is an output parameter. + * - The storage layout should match that of `numerator`. + * @param q_size Size of the quotient array for one polynomial. + * @param r_out Pointer to the array where the remainder polynomial(s) will be stored. This is an output parameter. + * - The storage layout should match that of `numerator`. + * - The size of `r_out` should be sufficient to hold the remainder coefficients for each polynomial. * @param r_size Size of the remainder array. * @return eIcicleError Error code indicating success or failure.
+ * + * @note The degrees should satisfy `numerator_deg >= denominator_deg`. + * The sizes `q_size` and `r_size` must be at least `numerator_deg - denominator_deg + 1` and `denominator_deg`, + * respectively. The function assumes that the input and output arrays are properly allocated. */ + template eIcicleError polynomial_division( const T* numerator, - int64_t numerator_deg, - const T* denumerator, - int64_t denumerator_deg, + uint64_t numerator_size, + const T* denominator, + uint64_t denominator_size, const VecOpsConfig& config, T* q_out /*OUT*/, uint64_t q_size, diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index d42fa0dca..6e159074f 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -3,67 +3,130 @@ namespace icicle { + /*********************************** REDUCE PRODUCT ************************/ + ICICLE_DISPATCHER_INST(VectorProductDispatcher, vector_product, VectorReduceOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_product)( + const scalar_t* vec_a, uint64_t size, const VecOpsConfig* config, scalar_t* output) + { + return VectorProductDispatcher::execute(vec_a, size, *config, output); + } + + template <> + eIcicleError vector_product(const scalar_t* vec_a, uint64_t size, const VecOpsConfig& config, scalar_t* output) + { + return CONCAT_EXPAND(FIELD, vector_product)(vec_a, size, &config, output); + } + +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(VectorProductExtFieldDispatcher, extension_vector_product, extFieldVectorReduceOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_product)( + const extension_t* vec_a, uint64_t size, const VecOpsConfig* config, extension_t* output) + { + return VectorProductExtFieldDispatcher::execute(vec_a, size, *config, output); + } + + template <> + eIcicleError vector_product(const extension_t* vec_a, uint64_t size, const VecOpsConfig& config, extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_vector_product)(vec_a, size, &config, output); + } +#endif // EXT_FIELD + + /*********************************** REDUCE SUM ****************************/ + ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, VectorReduceOpImpl); + + extern "C" eIcicleError + CONCAT_EXPAND(FIELD, vector_sum)(const scalar_t* vec_a, uint64_t size, const VecOpsConfig* config, scalar_t* output) + { + return VectorSumDispatcher::execute(vec_a, size, *config, output); + } + + template <> + eIcicleError vector_sum(const scalar_t* vec_a, uint64_t size, const VecOpsConfig& config, scalar_t* output) + { + return CONCAT_EXPAND(FIELD, vector_sum)(vec_a, size, &config, output); + } + +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(VectorSumExtFieldDispatcher, extension_vector_sum, extFieldVectorReduceOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_sum)( + const extension_t* vec_a, uint64_t size, const VecOpsConfig* config, extension_t* output) + { + return VectorSumExtFieldDispatcher::execute(vec_a, size, *config, output); + } + + template <> + eIcicleError vector_sum(const extension_t* vec_a, uint64_t size, const VecOpsConfig& config, extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_vector_sum)(vec_a, size, &config, output); + } +#endif // EXT_FIELD + /*********************************** ADD ***********************************/ ICICLE_DISPATCHER_INST(VectorAddDispatcher, vector_add, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_add)( - const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + 
const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorAddDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorAddDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError - vector_add(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_add(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_add)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_add)(vec_a, vec_b, size, &config, output); } #ifdef EXT_FIELD ICICLE_DISPATCHER_INST(VectorAddExtFieldDispatcher, extension_vector_add, extFieldVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_add)( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig* config, extension_t* output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig* config, extension_t* output) { - return VectorAddExtFieldDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorAddExtFieldDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError vector_add( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig& config, extension_t* output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_vector_add)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, extension_vector_add)(vec_a, vec_b, size, &config, output); } #endif // EXT_FIELD /*********************************** ACCUMULATE ***********************************/ - ICICLE_DISPATCHER_INST(VectorAccumulateDispatcher, vector_accumulate, scalarVectorOpImplInplaceA); + ICICLE_DISPATCHER_INST(VectorAccumulateDispatcher, vector_accumulate, vectorVectorOpImplInplaceA); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_accumulate)( - scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config) + scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config) { - return VectorAccumulateDispatcher::execute(vec_a, vec_b, n, *config); + return VectorAccumulateDispatcher::execute(vec_a, vec_b, size, *config); } template <> - eIcicleError vector_accumulate(scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config) + eIcicleError vector_accumulate(scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config) { - return CONCAT_EXPAND(FIELD, vector_accumulate)(vec_a, vec_b, n, &config); + return CONCAT_EXPAND(FIELD, vector_accumulate)(vec_a, vec_b, size, &config); } #ifdef EXT_FIELD ICICLE_DISPATCHER_INST(VectorAccumulateExtFieldDispatcher, extension_vector_accumulate, extFieldVectorOpImplInplaceA); extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_accumulate)( - extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig* config) + extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig* config) { - return VectorAccumulateExtFieldDispatcher::execute(vec_a, vec_b, n, *config); + return VectorAccumulateExtFieldDispatcher::execute(vec_a, vec_b, size, *config); } template <> - eIcicleError vector_accumulate(extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig& config) + eIcicleError + vector_accumulate(extension_t* vec_a, 
const extension_t* vec_b, uint64_t size, const VecOpsConfig& config) { - return CONCAT_EXPAND(FIELD, extension_vector_accumulate)(vec_a, vec_b, n, &config); + return CONCAT_EXPAND(FIELD, extension_vector_accumulate)(vec_a, vec_b, size, &config); } #endif // EXT_FIELD @@ -71,32 +134,32 @@ namespace icicle { ICICLE_DISPATCHER_INST(VectorSubDispatcher, vector_sub, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_sub)( - const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorSubDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorSubDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError - vector_sub(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_sub(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_sub)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_sub)(vec_a, vec_b, size, &config, output); } #ifdef EXT_FIELD ICICLE_DISPATCHER_INST(VectorSubExtFieldDispatcher, extension_vector_sub, extFieldVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_sub)( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig* config, extension_t* output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig* config, extension_t* output) { - return VectorSubExtFieldDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorSubExtFieldDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError vector_sub( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig& config, extension_t* output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_vector_sub)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, extension_vector_sub)(vec_a, vec_b, size, &config, output); } #endif // EXT_FIELD @@ -104,32 +167,32 @@ namespace icicle { ICICLE_DISPATCHER_INST(VectorMulDispatcher, vector_mul, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_mul)( - const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorMulDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorMulDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError - vector_mul(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_mul(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_mul)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_mul)(vec_a, vec_b, size, &config, output); } #ifdef EXT_FIELD ICICLE_DISPATCHER_INST(VectorMulExtFieldDispatcher, extension_vector_mul, extFieldVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_mul)( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig* config, extension_t* output) + 
const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig* config, extension_t* output) { - return VectorMulExtFieldDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorMulExtFieldDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError vector_mul( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig& config, extension_t* output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_vector_mul)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, extension_vector_mul)(vec_a, vec_b, size, &config, output); } #endif // EXT_FIELD @@ -137,80 +200,172 @@ namespace icicle { ICICLE_DISPATCHER_INST(VectorDivDispatcher, vector_div, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_div)( - const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorDivDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorDivDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError - vector_div(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_div(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_div)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_div)(vec_a, vec_b, size, &config, output); } +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(VectorDivExtFieldDispatcher, extension_vector_div, extFieldVectorOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_div)( + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig* config, extension_t* output) + { + return VectorDivExtFieldDispatcher::execute(vec_a, vec_b, size, *config, output); + } + + template <> + eIcicleError vector_div( + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config, extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_vector_div)(vec_a, vec_b, size, &config, output); + } +#endif // EXT_FIELD + /*********************************** (Scalar + Vector) ELEMENT WISE ***********************************/ ICICLE_DISPATCHER_INST(ScalarAddDispatcher, scalar_add_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_add_vec)( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return ScalarAddDispatcher::execute(scalar_a, vec_b, n, *config, output); + return ScalarAddDispatcher::execute(scalar_a, vec_b, size, *config, output); } template <> eIcicleError scalar_add_vec( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) + { + return CONCAT_EXPAND(FIELD, scalar_add_vec)(scalar_a, vec_b, size, &config, output); + } + +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(ScalarAddExtFieldDispatcher, extension_scalar_add_vec, extFieldVectorOpImpl); + + extern "C" eIcicleError 
CONCAT_EXPAND(FIELD, extension_scalar_add_vec)( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig* config, + extension_t* output) { - return CONCAT_EXPAND(FIELD, scalar_add_vec)(scalar_a, vec_b, n, &config, output); + return ScalarAddExtFieldDispatcher::execute(scalar_a, vec_b, size, *config, output); } + template <> + eIcicleError scalar_add_vec( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig& config, + extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_scalar_add_vec)(scalar_a, vec_b, size, &config, output); + } +#endif // EXT_FIELD + /*********************************** (Scalar - Vector) ELEMENT WISE ***********************************/ ICICLE_DISPATCHER_INST(ScalarSubDispatcher, scalar_sub_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_sub_vec)( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return ScalarSubDispatcher::execute(scalar_a, vec_b, n, *config, output); + return ScalarSubDispatcher::execute(scalar_a, vec_b, size, *config, output); } template <> eIcicleError scalar_sub_vec( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, scalar_sub_vec)(scalar_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, scalar_sub_vec)(scalar_a, vec_b, size, &config, output); } + +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(ScalarSubExtFieldDispatcher, extension_scalar_sub_vec, extFieldVectorOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_scalar_sub_vec)( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig* config, + extension_t* output) + { + return ScalarSubExtFieldDispatcher::execute(scalar_a, vec_b, size, *config, output); + } + + template <> + eIcicleError scalar_sub_vec( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig& config, + extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_scalar_sub_vec)(scalar_a, vec_b, size, &config, output); + } +#endif // EXT_FIELD /*********************************** MUL BY SCALAR ***********************************/ ICICLE_DISPATCHER_INST(ScalarMulDispatcher, scalar_mul_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_mul_vec)( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) + { + return ScalarMulDispatcher::execute(scalar_a, vec_b, size, *config, output); + } + + template <> + eIcicleError scalar_mul_vec( + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) + { + return CONCAT_EXPAND(FIELD, scalar_mul_vec)(scalar_a, vec_b, size, &config, output); + } + +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(ScalarMulExtFieldDispatcher, extension_scalar_mul_vec, extFieldVectorOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_scalar_mul_vec)( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig* config, + 
extension_t* output) { - return ScalarMulDispatcher::execute(scalar_a, vec_b, n, *config, output); + return ScalarMulExtFieldDispatcher::execute(scalar_a, vec_b, size, *config, output); } template <> eIcicleError scalar_mul_vec( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig& config, + extension_t* output) { - return CONCAT_EXPAND(FIELD, scalar_mul_vec)(scalar_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, extension_scalar_mul_vec)(scalar_a, vec_b, size, &config, output); } +#endif // EXT_FIELD /*********************************** CONVERT MONTGOMERY ***********************************/ ICICLE_DISPATCHER_INST(ScalarConvertMontgomeryDispatcher, scalar_convert_montgomery, scalarConvertMontgomeryImpl) extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_convert_montgomery)( - const scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, scalar_t* output) + const scalar_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig* config, scalar_t* output) { - return ScalarConvertMontgomeryDispatcher::execute(input, size, is_into, *config, output); + return ScalarConvertMontgomeryDispatcher::execute(input, size, is_to_montgomery, *config, output); } template <> - eIcicleError - convert_montgomery(const scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig& config, scalar_t* output) + eIcicleError convert_montgomery( + const scalar_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, scalar_convert_montgomery)(input, size, is_into, &config, output); + return CONCAT_EXPAND(FIELD, scalar_convert_montgomery)(input, size, is_to_montgomery, &config, output); } #ifdef EXT_FIELD @@ -218,16 +373,16 @@ namespace icicle { ExtFieldConvertMontgomeryDispatcher, extension_scalar_convert_montgomery, extFieldConvertMontgomeryImpl) extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_scalar_convert_montgomery)( - const extension_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, extension_t* output) + const extension_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig* config, extension_t* output) { - return ExtFieldConvertMontgomeryDispatcher::execute(input, size, is_into, *config, output); + return ExtFieldConvertMontgomeryDispatcher::execute(input, size, is_to_montgomery, *config, output); } template <> eIcicleError convert_montgomery( - const extension_t* input, uint64_t size, bool is_into, const VecOpsConfig& config, extension_t* output) + const extension_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_scalar_convert_montgomery)(input, size, is_into, &config, output); + return CONCAT_EXPAND(FIELD, extension_scalar_convert_montgomery)(input, size, is_to_montgomery, &config, output); } #endif // EXT_FIELD @@ -271,11 +426,12 @@ namespace icicle { const scalar_t* input, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig* config, scalar_t* output) { - return ScalarSliceDispatcher::execute(input, offset, stride, size, *config, output); + return ScalarSliceDispatcher::execute(input, offset, stride, size_in, size_out, *config, output); } template <> @@ -283,11 +439,31 @@ namespace icicle { const scalar_t* input, uint64_t offset, uint64_t stride, - uint64_t 
size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, slice)(input, offset, stride, size, &config, output); + return CONCAT_EXPAND(FIELD, slice)(input, offset, stride, size_in, size_out, &config, output); + } + + // Deprecated API + template <> + eIcicleError slice( + const scalar_t* input, + uint64_t offset, + uint64_t stride, + uint64_t size_out, + const VecOpsConfig& config, + scalar_t* output) + { + const auto size_in = offset + stride * (size_out - 1) + 1; // input should be at least that large + ICICLE_LOG_WARNING << "This slice API is deprecated and has been replaced. Use the new slice API instead"; + if (config.batch_size != 1) { + ICICLE_LOG_ERROR << "deprecated slice API does not support batch"; + return eIcicleError::INVALID_ARGUMENT; + } + return slice(input, offset, stride, size_in, size_out, config, output); } #ifdef EXT_FIELD @@ -297,11 +473,12 @@ namespace icicle { const extension_t* input, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig* config, extension_t* output) { - return ExtFieldSliceDispatcher::execute(input, offset, stride, size, *config, output); + return ExtFieldSliceDispatcher::execute(input, offset, stride, size_in, size_out, *config, output); } template <> @@ -309,15 +486,16 @@ namespace icicle { const extension_t* input, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_slice)(input, offset, stride, size, &config, output); + return CONCAT_EXPAND(FIELD, extension_slice)(input, offset, stride, size_in, size_out, &config, output); } #endif // EXT_FIELD - /*********************************** HIGHEST NON ZERO IDX ***********************************/ + /*********************************** HIGHEST NON ZERO IDX ***********************************/ ICICLE_DISPATCHER_INST(ScalarHighestNonZeroIdxDispatcher, highest_non_zero_idx, scalarHighNonZeroIdxOpImpl) @@ -367,25 +545,25 @@ namespace icicle { extern "C" eIcicleError CONCAT_EXPAND(FIELD, poly_division)( const scalar_t* numerator, - int64_t numerator_deg, - const scalar_t* denumerator, - int64_t denumerator_deg, - const VecOpsConfig* config, + uint64_t numerator_size, + const scalar_t* denominator, + uint64_t denominator_size, + const VecOpsConfig& config, scalar_t* q_out /*OUT*/, uint64_t q_size, scalar_t* r_out /*OUT*/, uint64_t r_size) { return ScalarPolyDivDispatcher::execute( - numerator, numerator_deg, denumerator, denumerator_deg, *config, q_out, q_size, r_out, r_size); + numerator, numerator_size, denominator, denominator_size, config, q_out, q_size, r_out, r_size); } template <> eIcicleError polynomial_division( const scalar_t* numerator, - int64_t numerator_deg, - const scalar_t* denumerator, - int64_t denumerator_deg, + uint64_t numerator_size, + const scalar_t* denominator, + uint64_t denominator_size, const VecOpsConfig& config, scalar_t* q_out /*OUT*/, uint64_t q_size, @@ -393,7 +571,7 @@ namespace icicle { uint64_t r_size) { return CONCAT_EXPAND(FIELD, poly_division)( - numerator, numerator_deg, denumerator, denumerator_deg, &config, q_out, q_size, r_out, r_size); + numerator, numerator_size, denominator, denominator_size, config, q_out, q_size, r_out, r_size); } } // namespace icicle \ No newline at end of file diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 072142876..703018797 100644 ---
a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -1,3 +1,4 @@ +#include #include #include #include "dlfcn.h" @@ -14,6 +15,8 @@ using namespace field_config; using namespace icicle; +// TODO Hadar - add tests that test different configurations of data on device or on host. + using FpMicroseconds = std::chrono::duration; #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); #define END_TIMER(timer, msg, enable) \ @@ -22,12 +25,13 @@ using FpMicroseconds = std::chrono::duration s_registered_devices; +bool s_is_cuda_registered; // TODO Yuval remove this -template -class FieldApiTest : public ::testing::Test +class FieldApiTestBase : public ::testing::Test { public: // SetUpTestSuite/TearDownTestSuite are called once for the entire test suite @@ -38,10 +42,11 @@ class FieldApiTest : public ::testing::Test #endif icicle_load_backend_from_env_or_default(); - const bool is_cuda_registered = is_device_registered("CUDA"); - if (!is_cuda_registered) { ICICLE_LOG_ERROR << "CUDA device not found. Testing CPU vs CPU"; } - s_main_target = is_cuda_registered ? "CUDA" : "CPU"; + s_is_cuda_registered = is_device_registered("CUDA"); + if (!s_is_cuda_registered) { ICICLE_LOG_ERROR << "CUDA device not found. Testing CPU vs reference (on cpu)"; } + s_main_target = s_is_cuda_registered ? "CUDA" : "CPU"; s_reference_target = "CPU"; + s_registered_devices = get_registered_devices_list(); } static void TearDownTestSuite() { @@ -52,7 +57,12 @@ class FieldApiTest : public ::testing::Test // SetUp/TearDown are called before and after each test void SetUp() override {} void TearDown() override {} +}; +template +class FieldApiTest : public FieldApiTestBase +{ +public: void random_samples(T* arr, uint64_t count) { for (uint64_t i = 0; i < count; i++) @@ -84,16 +94,24 @@ TYPED_TEST(FieldApiTest, FieldSanityTest) ASSERT_EQ(a * scalar_t::from(2), a + a); } -TYPED_TEST(FieldApiTest, vectorOps) +TYPED_TEST(FieldApiTest, vectorVectorOps) { - const uint64_t N = 1 << 22; - auto in_a = std::make_unique(N); - auto in_b = std::make_unique(N); - FieldApiTest::random_samples(in_a.get(), N); - FieldApiTest::random_samples(in_b.get(), N); + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; - auto out_main = std::make_unique(N); - auto out_ref = std::make_unique(N); + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + + const int total_size = N * batch_size; + auto in_a = std::make_unique(total_size); + auto in_b = std::make_unique(total_size); + auto out_main = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); auto vector_accumulate_wrapper = [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { @@ -105,6 +123,8 @@ TYPED_TEST(FieldApiTest, vectorOps) Device dev = {dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; std::ostringstream oss; oss << dev_type << " " << msg; @@ -116,45 +136,329 @@ TYPED_TEST(FieldApiTest, vectorOps) END_TIMER(VECADD_sync, oss.str().c_str(), measure); }; - // warmup - // run(s_reference_target, out_ref.get(), false /*=measure*/, 16 /*=iters*/); - // run(s_main_target, out_main.get(), false /*=measure*/, 1 /*=iters*/); + // add + 
FieldApiTest::random_samples(in_a.get(), total_size); + FieldApiTest::random_samples(in_b.get(), total_size); + if (!s_is_cuda_registered) { + for (int i = 0; i < total_size; i++) { + out_ref[i] = in_a[i] + in_b[i]; + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); // accumulate - auto temp_result = std::make_unique(N); - auto initial_in_a = std::make_unique(N); - - std::memcpy(initial_in_a.get(), in_a.get(), N * sizeof(TypeParam)); - run(s_reference_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); - std::memcpy(temp_result.get(), in_a.get(), N * sizeof(TypeParam)); - std::memcpy(in_a.get(), initial_in_a.get(), N * sizeof(TypeParam)); + FieldApiTest::random_samples(in_a.get(), total_size); + FieldApiTest::random_samples(in_b.get(), total_size); + for (int i = 0; i < total_size; i++) { // TODO - compare gpu against cpu with inplace operations? + out_ref[i] = in_a[i] + in_b[i]; + } run(s_main_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); - ASSERT_EQ(0, memcmp(in_a.get(), temp_result.get(), N * sizeof(TypeParam))); - // add - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); - run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), N * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), total_size * sizeof(TypeParam))); // sub - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sub, "vector sub", ITERS); + FieldApiTest::random_samples(in_a.get(), total_size); + FieldApiTest::random_samples(in_b.get(), total_size); + if (!s_is_cuda_registered) { + for (int i = 0; i < total_size; i++) { + out_ref[i] = in_a[i] - in_b[i]; + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sub, "vector sub", ITERS); + } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sub, "vector sub", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), N * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); // mul - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_mul, "vector mul", ITERS); + FieldApiTest::random_samples(in_a.get(), total_size); + FieldApiTest::random_samples(in_b.get(), total_size); + if (!s_is_cuda_registered) { + for (int i = 0; i < total_size; i++) { + out_ref[i] = in_a[i] * in_b[i]; + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_mul, "vector mul", ITERS); + } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_mul, "vector mul", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), N * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + + // div + TypeParam::rand_host_many(in_a.get(), total_size); + TypeParam::rand_host_many(in_b.get(), total_size); + // reference + if (!s_is_cuda_registered) { + for (int i = 0; i < total_size; i++) { + out_ref[i] = in_a[i] * TypeParam::inverse(in_b[i]); + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_div, "vector div", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, 
vector_div, "vector div", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); } -TYPED_TEST(FieldApiTest, matrixAPIsAsync) +TYPED_TEST(FieldApiTest, montgomeryConversion) { - const int R = 1 << 10, C = 1 << 8; - auto h_in = std::make_unique(R * C); - FieldApiTest::random_samples(h_in.get(), R * C); + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const bool is_to_montgomery = rand() % 2; + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + ICICLE_LOG_DEBUG << "is_to_montgomery = " << is_to_montgomery; + const int total_size = N * batch_size; + auto in_a = std::make_unique(total_size); + auto out_main = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); + + auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + + std::ostringstream oss; + oss << dev_type << " " << msg; + + START_TIMER(MONTGOMERY) + for (int i = 0; i < iters; ++i) { + ICICLE_CHECK(convert_montgomery(in_a.get(), N, is_to_montgomery, config, out)); + } + END_TIMER(MONTGOMERY, oss.str().c_str(), measure); + }; - auto h_out_main = std::make_unique(R * C); - auto h_out_ref = std::make_unique(R * C); + // convert_montgomery + FieldApiTest::random_samples(in_a.get(), total_size); + // reference + if (!s_is_cuda_registered) { + if (is_to_montgomery) { + for (int i = 0; i < total_size; i++) { + out_ref[i] = TypeParam::to_montgomery(in_a[i]); + } + } else { + for (int i = 0; i < total_size; i++) { + out_ref[i] = TypeParam::from_montgomery(in_a[i]); + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "montgomery", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "montgomery", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); +} + +TEST_F(FieldApiTestBase, VectorReduceOps) +{ + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const int total_size = N * batch_size; + + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + + auto in_a = std::make_unique(total_size); + auto out_main = std::make_unique(batch_size); + auto out_ref = std::make_unique(batch_size); + + auto vector_accumulate_wrapper = + [](scalar_t* a, const scalar_t* b, uint64_t size, const VecOpsConfig& config, scalar_t* /*out*/) { + return vector_accumulate(a, b, size, config); + }; + + auto run = + [&](const std::string& dev_type, scalar_t* out, bool measure, auto vec_op_func, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + + std::ostringstream oss; + oss << dev_type << " " << msg; + + START_TIMER(VECADD_sync) + for (int i = 0; i < iters; ++i) { + ICICLE_CHECK(vec_op_func(in_a.get(), N, config, out)); + } + END_TIMER(VECADD_sync, 
oss.str().c_str(), measure); + }; + + // sum + scalar_t::rand_host_many(in_a.get(), total_size); + // reference + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + out_ref[idx_in_batch] = scalar_t::from(0); + } + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { + uint64_t idx_a = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_in_batch] = out_ref[idx_in_batch] + in_a[idx_a]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(scalar_t))); + + // product + scalar_t::rand_host_many(in_a.get(), total_size); + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + out_ref[idx_in_batch] = scalar_t::from(1); + } + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { + uint64_t idx_a = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_in_batch] = out_ref[idx_in_batch] * in_a[idx_a]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(scalar_t))); +} + +TEST_F(FieldApiTestBase, scalarVectorOps) +{ + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + + const int total_size = N * batch_size; + auto scalar_a = std::make_unique(batch_size); + auto in_b = std::make_unique(total_size); + auto out_main = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + + auto vector_accumulate_wrapper = + [](scalar_t* a, const scalar_t* b, uint64_t size, const VecOpsConfig& config, scalar_t* /*out*/) { + return vector_accumulate(a, b, size, config); + }; + + auto run = + [&](const std::string& dev_type, scalar_t* out, bool measure, auto vec_op_func, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + + std::ostringstream oss; + oss << dev_type << " " << msg; + + START_TIMER(VECADD_sync) + for (int i = 0; i < iters; ++i) { + ICICLE_CHECK(vec_op_func(scalar_a.get(), in_b.get(), N, config, out)); + } + END_TIMER(VECADD_sync, oss.str().c_str(), measure); + }; + + // scalar add vec + scalar_t::rand_host_many(scalar_a.get(), batch_size); + scalar_t::rand_host_many(in_b.get(), total_size); + + // reference + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; 
idx_in_N++) { + uint64_t idx_b = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_b] = (scalar_a[idx_in_batch]) + in_b[idx_b]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); + + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); + + // scalar sub vec + scalar_t::rand_host_many(scalar_a.get(), batch_size); + scalar_t::rand_host_many(in_b.get(), total_size); + + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { + uint64_t idx_b = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_b] = (scalar_a[idx_in_batch]) - in_b[idx_b]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); + } + + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); + + // scalar mul vec + scalar_t::rand_host_many(scalar_a.get(), batch_size); + scalar_t::rand_host_many(in_b.get(), total_size); + + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { + uint64_t idx_b = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_b] = (scalar_a[idx_in_batch]) * in_b[idx_b]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); +} + +TYPED_TEST(FieldApiTest, matrixAPIsAsync) +{ + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const int R = + 1 + << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 + const int C = + 1 + << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 + const int batch_size = 1 << (rand() % 4); + const bool columns_batch = rand() % 2; + const bool is_in_place = + s_is_cuda_registered ? 
0 : rand() % 2; // TODO - fix inplace (Hadar: I'm not sure we should support it) + + ICICLE_LOG_DEBUG << "rows = " << R; + ICICLE_LOG_DEBUG << "cols = " << C; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + + const int total_size = R * C * batch_size; + auto h_inout = std::make_unique(total_size); + auto h_out_main = std::make_unique(total_size); + auto h_out_ref = std::make_unique(total_size); auto run = [&](const std::string& dev_type, TypeParam* h_out, bool measure, const char* msg, int iters) { Device dev = {dev_type, 0}; @@ -163,6 +467,8 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) DeviceProperties device_props; icicle_get_device_properties(device_props); auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; std::ostringstream oss; oss << dev_type << " " << msg; @@ -172,16 +478,16 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) TypeParam *d_in, *d_out; if (!device_props.using_host_memory) { icicle_create_stream(&config.stream); - icicle_malloc_async((void**)&d_in, R * C * sizeof(TypeParam), config.stream); - icicle_malloc_async((void**)&d_out, R * C * sizeof(TypeParam), config.stream); - icicle_copy_to_device_async(d_in, h_in.get(), R * C * sizeof(TypeParam), config.stream); + icicle_malloc_async((void**)&d_in, total_size * sizeof(TypeParam), config.stream); + icicle_malloc_async((void**)&d_out, total_size * sizeof(TypeParam), config.stream); + icicle_copy_to_device_async(d_in, h_inout.get(), total_size * sizeof(TypeParam), config.stream); config.is_a_on_device = true; config.is_result_on_device = true; config.is_async = false; } - TypeParam* in = device_props.using_host_memory ? h_in.get() : d_in; + TypeParam* in = device_props.using_host_memory ? h_inout.get() : d_in; TypeParam* out = device_props.using_host_memory ? h_out : d_out; START_TIMER(TRANSPOSE) @@ -191,106 +497,367 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) END_TIMER(TRANSPOSE, oss.str().c_str(), measure); if (!device_props.using_host_memory) { - icicle_copy_to_host_async(h_out, d_out, R * C * sizeof(TypeParam), config.stream); + icicle_copy_to_host_async(h_out, d_out, total_size * sizeof(TypeParam), config.stream); icicle_stream_synchronize(config.stream); icicle_free_async(d_in, config.stream); icicle_free_async(d_out, config.stream); } }; - run(s_reference_target, h_out_ref.get(), VERBOSE /*=measure*/, "transpose", ITERS); - run(s_main_target, h_out_main.get(), VERBOSE /*=measure*/, "transpose", ITERS); - ASSERT_EQ(0, memcmp(h_out_main.get(), h_out_ref.get(), R * C * sizeof(TypeParam))); + // Option 1: Initialize each input matrix in the batch with the same ascending values + // for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + // for (uint32_t i = 0; i < R * C; i++) { + // if(columns_batch){ + // h_inout[idx_in_batch + batch_size * i] = TypeParam::from(i); + // } else { + // h_inout[idx_in_batch * R * C + i] = TypeParam::from(i); + // } + // } + // } + + // Option 2: Initialize the entire input array with ascending values + // for (int i = 0; i < total_size; i++) { + // h_inout[i] = TypeParam::from(i); + // } + + // Option 3: Initialize the entire input array with random values + TypeParam::rand_host_many(h_inout.get(), total_size); + + // Reference implementation + if (!s_is_cuda_registered) { + const TypeParam* cur_mat_in = h_inout.get(); + TypeParam* cur_mat_out = h_out_ref.get(); + uint32_t stride = columns_batch ? 
batch_size : 1; + const uint64_t total_elements_one_mat = static_cast(R) * C; + for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + // Perform the matrix transpose + for (uint32_t i = 0; i < R; ++i) { + for (uint32_t j = 0; j < C; ++j) { + cur_mat_out[stride * (j * R + i)] = cur_mat_in[stride * (i * C + j)]; + } + } + cur_mat_in += (columns_batch ? 1 : total_elements_one_mat); + cur_mat_out += (columns_batch ? 1 : total_elements_one_mat); + } + } else { + run(s_reference_target, (is_in_place ? h_inout.get() : h_out_ref.get()), VERBOSE /*=measure*/, "transpose", ITERS); + } + + run(s_main_target, (is_in_place ? h_inout.get() : h_out_main.get()), VERBOSE /*=measure*/, "transpose", ITERS); + + if (is_in_place) { + ASSERT_EQ(0, memcmp(h_inout.get(), h_out_ref.get(), total_size * sizeof(TypeParam))); + } else { + ASSERT_EQ(0, memcmp(h_out_main.get(), h_out_ref.get(), total_size * sizeof(TypeParam))); + } } -TYPED_TEST(FieldApiTest, montgomeryConversion) +TYPED_TEST(FieldApiTest, bitReverse) { - const uint64_t N = 1 << 18; - auto elements_main = std::make_unique(N); - auto elements_ref = std::make_unique(N); - FieldApiTest::random_samples(elements_main.get(), N); - memcpy(elements_ref.get(), elements_main.get(), N * sizeof(TypeParam)); + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const bool is_in_place = rand() % 2; + const int total_size = N * batch_size; - auto run = [&](const std::string& dev_type, TypeParam* inout, bool measure, const char* msg, int iters) { + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + ICICLE_LOG_DEBUG << "is_in_place = " << is_in_place; + + auto in_a = std::make_unique(total_size); + auto out_main = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); + + auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { Device dev = {dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; std::ostringstream oss; oss << dev_type << " " << msg; - START_TIMER(MONTGOMERY) + START_TIMER(BIT_REVERSE) for (int i = 0; i < iters; ++i) { - ICICLE_CHECK(convert_montgomery(inout, N, true /*into montgomery*/, config, inout)); + ICICLE_CHECK(bit_reverse(in_a.get(), N, config, out)); } - END_TIMER(MONTGOMERY, oss.str().c_str(), measure); + END_TIMER(BIT_REVERSE, oss.str().c_str(), measure); }; - run(s_reference_target, elements_main.get(), VERBOSE /*=measure*/, "montgomery", 1); - run(s_main_target, elements_ref.get(), VERBOSE /*=measure*/, "montgomery", 1); - ASSERT_EQ(0, memcmp(elements_main.get(), elements_ref.get(), N * sizeof(TypeParam))); + // // Option 1: Initialize each input vector in the batch with the same ascending values + // for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + // for (uint32_t i = 0; i < N; i++) { + // if(columns_batch){ + // in_a[idx_in_batch + batch_size * i] = TypeParam::from(i); + // } else { + // in_a[idx_in_batch * N + i] = TypeParam::from(i); + // } + // } + // } + + // // Option 2: Initialize the entire input array with ascending values + // for (int i = 0; i < total_size; i++) { + // in_a[i] = TypeParam::from(i); + // } + + // Option 3: Initialize the entire input array with 
random values + FieldApiTest::random_samples(in_a.get(), total_size); + + // Reference implementation + if (!s_is_cuda_registered || is_in_place) { + uint64_t logn = 0; + uint64_t temp = N; + while (temp > 1) { + temp >>= 1; + logn++; + } + // BIT REVERSE FUNCTION + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t i = 0; i < N; i++) { + int rev = 0; + for (int j = 0; j < logn; ++j) { + if (i & (1 << j)) { rev |= 1 << (logn - 1 - j); } + } + if (columns_batch) { + out_ref[idx_in_batch + batch_size * i] = in_a[idx_in_batch + batch_size * rev]; + } else { + out_ref[idx_in_batch * N + i] = in_a[idx_in_batch * N + rev]; + } + } + } + } else { + run(s_reference_target, (is_in_place ? in_a.get() : out_ref.get()), VERBOSE /*=measure*/, "bit-reverse", 1); + } + run(s_main_target, (is_in_place ? in_a.get() : out_main.get()), VERBOSE /*=measure*/, "bit-reverse", 1); + + if (is_in_place) { + ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), N * sizeof(TypeParam))); + } else { + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + } } -TYPED_TEST(FieldApiTest, bitReverse) +TYPED_TEST(FieldApiTest, Slice) +{ + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t size_in = 1 << (rand() % 15 + 5); + const uint64_t offset = rand() % 15; + const uint64_t stride = rand() % 4 + 1; + const uint64_t size_out = rand() % (((size_in - offset) / stride) - 1) + 1; + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + + ICICLE_LOG_DEBUG << "size_in = " << size_in; + ICICLE_LOG_DEBUG << "size_out = " << size_out; + ICICLE_LOG_DEBUG << "offset = " << offset; + ICICLE_LOG_DEBUG << "stride = " << stride; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + + const int total_size_in = size_in * batch_size; + const int total_size_out = size_out * batch_size; + + auto in_a = std::make_unique(total_size_in); + auto out_main = std::make_unique(total_size_out); + auto out_ref = std::make_unique(total_size_out); + + TypeParam::rand_host_many(in_a.get(), total_size_in); + + auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + + std::ostringstream oss; + oss << dev_type << " " << msg; + + START_TIMER(SLICE) + for (int i = 0; i < iters; ++i) { + ICICLE_CHECK(slice(in_a.get(), offset, stride, size_in, size_out, config, out)); + } + END_TIMER(SLICE, oss.str().c_str(), measure); + }; + + // Reference implementation + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t i = 0; i < size_out; i++) { + if (columns_batch) { + out_ref[idx_in_batch + batch_size * i] = in_a[idx_in_batch + batch_size * (offset + i * stride)]; + } else { + out_ref[idx_in_batch * size_out + i] = in_a[idx_in_batch * size_in + (offset + i * stride)]; + } + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "slice", 1); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "slice", 1); + + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size_out * sizeof(TypeParam))); +} + +TEST_F(FieldApiTestBase, highestNonZeroIdx) { - const uint64_t N = 1 << 18; - auto elements_main = std::make_unique(N); - auto elements_ref = 
std::make_unique(N); - FieldApiTest::random_samples(elements_main.get(), N); - memcpy(elements_ref.get(), elements_main.get(), N * sizeof(TypeParam)); + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const int total_size = N * batch_size; + + auto in_a = std::make_unique(total_size); + for (int i = 0; i < batch_size; ++i) { + // randomize different rows with zeros in the end + auto size = std::max(int64_t(N) / 4 - i, int64_t(1)); + scalar_t::rand_host_many(in_a.get() + i * N, size); + } + auto out_main = std::make_unique(batch_size); + auto out_ref = std::make_unique(batch_size); - auto run = [&](const std::string& dev_type, TypeParam* inout, bool measure, const char* msg, int iters) { + auto run = [&](const std::string& dev_type, int64_t* out, bool measure, const char* msg, int iters) { Device dev = {dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; std::ostringstream oss; oss << dev_type << " " << msg; - START_TIMER(BIT_REVERSE) + START_TIMER(highestNonZeroIdx) for (int i = 0; i < iters; ++i) { - ICICLE_CHECK(bit_reverse(inout, N, config, inout)); + ICICLE_CHECK(highest_non_zero_idx(in_a.get(), N, config, out)); } - END_TIMER(BIT_REVERSE, oss.str().c_str(), measure); + END_TIMER(highestNonZeroIdx, oss.str().c_str(), measure); }; - run(s_reference_target, elements_main.get(), VERBOSE /*=measure*/, "bit-reverse", 1); - run(s_main_target, elements_ref.get(), VERBOSE /*=measure*/, "bit-reverse", 1); - ASSERT_EQ(0, memcmp(elements_main.get(), elements_ref.get(), N * sizeof(TypeParam))); + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "highest_non_zero_idx", 1); + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "highest_non_zero_idx", 1); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(int64_t))); } -TYPED_TEST(FieldApiTest, Slice) +TEST_F(FieldApiTestBase, polynomialEval) { - const uint64_t N = 1 << 18; - const uint64_t offset = 2; - const uint64_t stride = 3; - const uint64_t size = 4; + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t coeffs_size = 1 << (rand() % 10 + 4); + const uint64_t domain_size = 1 << (rand() % 8 + 2); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + + ICICLE_LOG_DEBUG << "coeffs_size = " << coeffs_size; + ICICLE_LOG_DEBUG << "domain_size = " << domain_size; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + + const int total_coeffs_size = coeffs_size * batch_size; + const int total_result_size = domain_size * batch_size; + + auto in_coeffs = std::make_unique(total_coeffs_size); + auto in_domain = std::make_unique(domain_size); + auto out_main = std::make_unique(total_result_size); + auto out_ref = std::make_unique(total_result_size); + + auto run = [&](const std::string& dev_type, scalar_t* out, bool measure, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; - auto elements_main = std::make_unique(N); - auto elements_ref = std::make_unique(size); - auto elements_out = std::make_unique(size); + std::ostringstream oss; + oss << dev_type << " " << msg; - 
FieldApiTest::random_samples(elements_main.get(), N); + START_TIMER(polynomialEval) + for (int i = 0; i < iters; ++i) { + ICICLE_CHECK(polynomial_eval(in_coeffs.get(), coeffs_size, in_domain.get(), domain_size, config, out)); + } + END_TIMER(polynomialEval, oss.str().c_str(), measure); + }; - auto run = - [&](const std::string& dev_type, const TypeParam* in, TypeParam* out, bool measure, const char* msg, int iters) { - Device dev = {dev_type, 0}; - icicle_set_device(dev); - auto config = VecOpsConfig(); // Adjust configuration as needed + scalar_t::rand_host_many(in_coeffs.get(), total_coeffs_size); + scalar_t::rand_host_many(in_domain.get(), domain_size); - std::ostringstream oss; - oss << dev_type << " " << msg; + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_result_size * sizeof(scalar_t))); +} - START_TIMER(SLICE) - for (int i = 0; i < iters; ++i) { - ICICLE_CHECK(slice(in, offset, stride, size, config, out)); +TEST_F(FieldApiTestBase, polynomialDivision) +{ + const uint64_t numerator_size = 1 << 4; + const uint64_t denominator_size = 1 << 3; + const uint64_t q_size = numerator_size - denominator_size + 1; + const uint64_t r_size = numerator_size; + const int batch_size = 10 + rand() % 10; + + // basically we compute q(x),r(x) for a(x)=q(x)b(x)+r(x) by dividing a(x)/b(x) + + // randomize matrix with rows/cols as polynomials + auto numerator = std::make_unique(numerator_size * batch_size); + auto denominator = std::make_unique(denominator_size * batch_size); + scalar_t::rand_host_many(numerator.get(), numerator_size * batch_size); + scalar_t::rand_host_many(denominator.get(), denominator_size * batch_size); + + // Add padding to each row so that the degree is lower than the size + const int zero_pad_length = 5; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < zero_pad_length; ++j) { + numerator[i * numerator_size + numerator_size - zero_pad_length + j] = scalar_t::zero(); + denominator[i * denominator_size + denominator_size - zero_pad_length + j] = scalar_t::zero(); + } + } + + for (auto device : s_registered_devices) { + ICICLE_CHECK(icicle_set_device(device)); + for (int columns_batch = 0; columns_batch <= 1; columns_batch++) { + ICICLE_LOG_DEBUG << "testing polynomial division on device " << device << " [column_batch=" << columns_batch + << "]"; + auto q = std::make_unique(q_size * batch_size); + auto r = std::make_unique(r_size * batch_size); + + auto config = default_vec_ops_config(); + config.batch_size = columns_batch ? 
batch_size - zero_pad_length : batch_size; // skip the zero cols + config.columns_batch = columns_batch; + // TODO v3.2 support column batch for this API + if (columns_batch) { + ICICLE_LOG_INFO << "Skipping polynomial division column batch"; + continue; } - END_TIMER(SLICE, oss.str().c_str(), measure); - }; - run(s_reference_target, elements_main.get(), elements_ref.get(), VERBOSE /*=measure*/, "slice", 1); - run(s_main_target, elements_main.get(), elements_out.get(), VERBOSE /*=measure*/, "slice", 1); - ASSERT_EQ(0, memcmp(elements_ref.get(), elements_out.get(), size * sizeof(TypeParam))); + ICICLE_CHECK(polynomial_division( + numerator.get(), numerator_size, denominator.get(), denominator_size, config, q.get(), q_size, r.get(), + r_size)); + + // test a(x)=q(x)b(x)+r(x) in random point + const auto rand_x = scalar_t::rand_host(); + auto ax = std::make_unique(config.batch_size); + auto bx = std::make_unique(config.batch_size); + auto qx = std::make_unique(config.batch_size); + auto rx = std::make_unique(config.batch_size); + polynomial_eval(numerator.get(), numerator_size, &rand_x, 1, config, ax.get()); + polynomial_eval(denominator.get(), denominator_size, &rand_x, 1, config, bx.get()); + polynomial_eval(q.get(), q_size, &rand_x, 1, config, qx.get()); + polynomial_eval(r.get(), r_size, &rand_x, 1, config, rx.get()); + + for (int i = 0; i < config.batch_size; ++i) { + // ICICLE_LOG_DEBUG << "ax=" << ax[i] << ", bx=" << bx[i] << ", qx=" << qx[i] << ", rx=" << rx[i]; + ASSERT_EQ(ax[i], qx[i] * bx[i] + rx[i]); + } + } + } } #ifdef NTT @@ -301,13 +868,15 @@ TYPED_TEST(FieldApiTest, ntt) int seed = time(0); srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; const bool inplace = rand() % 2; const int logn = rand() % 15 + 3; const uint64_t N = 1 << logn; const int log_ntt_domain_size = logn + 1; const int log_batch_size = rand() % 3; const int batch_size = 1 << log_batch_size; - const Ordering ordering = static_cast(rand() % 4); + const int _ordering = rand() % 4; + const Ordering ordering = static_cast(_ordering); bool columns_batch; if (logn == 7 || logn < 4) { columns_batch = false; // currently not supported (icicle_v3/backend/cuda/src/ntt/ntt.cuh line 578) @@ -323,9 +892,17 @@ TYPED_TEST(FieldApiTest, ntt) coset_gen = scalar_t::one(); } + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + ICICLE_LOG_DEBUG << "inplace = " << inplace; + ICICLE_LOG_DEBUG << "ordering = " << _ordering; + ICICLE_LOG_DEBUG << "log_coset_stride = " << log_coset_stride; + const int total_size = N * batch_size; auto scalars = std::make_unique(total_size); - FieldApiTest::random_samples(scalars.get(), total_size); + TypeParam::rand_host_many(scalars.get(), total_size); + auto out_main = std::make_unique(total_size); auto out_ref = std::make_unique(total_size); auto run = [&](const std::string& dev_type, TypeParam* out, const char* msg, bool measure, int iters) { diff --git a/scripts/release/build_all.sh b/scripts/release/build_all.sh index cbb4b8860..b8050fb70 100755 --- a/scripts/release/build_all.sh +++ b/scripts/release/build_all.sh @@ -32,25 +32,25 @@ docker run --rm --gpus all \ -v ./icicle:/icicle \ -v "$output_dir:/output" \ -v ./scripts:/scripts \ - icicle-release-ubuntu22-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubuntu22 cuda122 & + icicle-release-ubuntu22-cuda122 bash /scripts/release/build_release_and_tar.sh icicle_3_1_0 ubuntu22 cuda122 & # ubuntu 20 docker run --rm --gpus all \ -v 
./icicle:/icicle \ -v "$output_dir:/output" \ -v ./scripts:/scripts \ - icicle-release-ubuntu20-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubuntu20 cuda122 & + icicle-release-ubuntu20-cuda122 bash /scripts/release/build_release_and_tar.sh icicle_3_1_0 ubuntu20 cuda122 & # ubi 8 (rhel compatible) docker run --rm --gpus all \ -v ./icicle:/icicle \ -v "$output_dir:/output" \ -v ./scripts:/scripts \ - icicle-release-ubi8-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubi8 cuda122 & + icicle-release-ubi8-cuda122 bash /scripts/release/build_release_and_tar.sh icicle_3_1_0 ubi8 cuda122 & # ubi 9 (rhel compatible) docker run --rm --gpus all \ -v ./icicle:/icicle \ -v "$output_dir:/output" \ -v ./scripts:/scripts \ - icicle-release-ubi9-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubi9 cuda122 & + icicle-release-ubi9-cuda122 bash /scripts/release/build_release_and_tar.sh icicle_3_1_0 ubi9 cuda122 & diff --git a/wrappers/golang/core/vec_ops.go b/wrappers/golang/core/vec_ops.go index 08b87ef08..3671f0653 100644 --- a/wrappers/golang/core/vec_ops.go +++ b/wrappers/golang/core/vec_ops.go @@ -29,7 +29,15 @@ type VecOpsConfig struct { /// non-blocking and you'll need to synchronize it explicitly by calling /// `SynchronizeStream`. If set to false, the function will block the current CPU thread. IsAsync bool - Ext config_extension.ConfigExtensionHandler + /// Number of vectors (or operations) to process in a batch. + /// Each vector operation will be performed independently on each batch element. + /// Default value: 1. + BatchSize int32 + /// True if the batched vectors are stored as columns in a 2D array (i.e., the vectors are + /// strided in memory as columns of a matrix). If false, the batched vectors are stored + /// contiguously in memory (e.g., as rows or in a flat array). Default value: false. 
+ ColumnsBatch bool + Ext config_extension.ConfigExtensionHandler } /** @@ -43,6 +51,8 @@ func DefaultVecOpsConfig() VecOpsConfig { false, // isBOnDevice false, // isResultOnDevice false, // IsAsync + 1, // BatchSize + false, // ColumnsBatch nil, // Ext } diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index ba22b776d..58e571d52 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -13,6 +13,8 @@ pub struct VecOpsConfig { pub is_b_on_device: bool, pub is_result_on_device: bool, pub is_async: bool, + pub batch_size: i32, + pub columns_batch: bool, pub ext: ConfigExtension, } @@ -24,6 +26,8 @@ impl VecOpsConfig { is_b_on_device: false, is_result_on_device: false, is_async: false, + batch_size: 1, + columns_batch: false, ext: ConfigExtension::new(), } } @@ -58,6 +62,46 @@ pub trait VecOps { cfg: &VecOpsConfig, ) -> Result<(), eIcicleError>; + fn div( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn sum( + a: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn product( + a: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn scalar_add( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn scalar_sub( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn scalar_mul( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + fn transpose( input: &(impl HostOrDeviceSlice + ?Sized), nof_rows: u32, @@ -76,6 +120,16 @@ pub trait VecOps { input: &mut (impl HostOrDeviceSlice + ?Sized), cfg: &VecOpsConfig, ) -> Result<(), eIcicleError>; + + fn slice( + input: &(impl HostOrDeviceSlice + ?Sized), + offset: u64, + stride: u64, + size_in: u64, + size_out: u64, + cfg: &VecOpsConfig, + output: &mut (impl HostOrDeviceSlice + ?Sized), + ) -> Result<(), eIcicleError>; } fn check_vec_ops_args<'a, F>( @@ -166,6 +220,88 @@ where <::Config as VecOps>::mul(a, b, result, &cfg) } +pub fn div_scalars( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(a, b, result, cfg); + <::Config as VecOps>::div(a, b, result, &cfg) +} + +pub fn sum_scalars( + a: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(a, a, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::sum(a, result, &cfg) +} + +pub fn product_scalars( + a: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let 
cfg = check_vec_ops_args(a, a, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::product(a, result, &cfg) +} + +pub fn scalar_add( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(b, b, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::scalar_add(a, b, result, &cfg) +} + +pub fn scalar_sub( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(b, b, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::scalar_sub(a, b, result, &cfg) +} + +pub fn scalar_mul( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(b, b, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::scalar_mul(a, b, result, &cfg) +} + pub fn transpose_matrix( input: &(impl HostOrDeviceSlice + ?Sized), nof_rows: u32, @@ -205,6 +341,22 @@ where <::Config as VecOps>::bit_reverse_inplace(input, &cfg) } +pub fn slice( + input: &(impl HostOrDeviceSlice + ?Sized), + offset: u64, + stride: u64, + size_in: u64, + size_out: u64, + cfg: &VecOpsConfig, + output: &mut (impl HostOrDeviceSlice + ?Sized), +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + <::Config as VecOps>::slice(input, offset, stride, size_in, size_out, &cfg, output) +} + #[macro_export] macro_rules! impl_vec_ops_field { ( @@ -255,6 +407,58 @@ macro_rules! impl_vec_ops_field { result: *mut $field, ) -> eIcicleError; + #[link_name = concat!($field_prefix, "_vector_div")] + pub(crate) fn vector_div_ffi( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_vector_sum")] + pub(crate) fn vector_sum_ffi( + a: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_vector_product")] + pub(crate) fn vector_product_ffi( + a: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_scalar_add_vec")] + pub(crate) fn scalar_add_ffi( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_scalar_sub_vec")] + pub(crate) fn scalar_sub_ffi( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_scalar_mul_vec")] + pub(crate) fn scalar_mul_ffi( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + #[link_name = concat!($field_prefix, "_matrix_transpose")] pub(crate) fn matrix_transpose_ffi( input: *const $field, @@ -271,6 +475,17 @@ macro_rules! 
impl_vec_ops_field { config: *const VecOpsConfig, output: *mut $field, ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_slice")] + pub(crate) fn slice_ffi( + input: *const $field, + offset: u64, + stride: u64, + size_in: u64, + size_out: u64, + cfg: *const VecOpsConfig, + output: *mut $field, + ) -> eIcicleError; } } @@ -345,6 +560,110 @@ macro_rules! impl_vec_ops_field { } } + fn div( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + b: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::vector_div_ffi( + a.as_ptr(), + b.as_ptr(), + a.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn sum( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::vector_sum_ffi( + a.as_ptr(), + a.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn product( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::vector_product_ffi( + a.as_ptr(), + a.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn scalar_add( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + b: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::scalar_add_ffi( + a.as_ptr(), + b.as_ptr(), + b.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn scalar_sub( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + b: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::scalar_sub_ffi( + a.as_ptr(), + b.as_ptr(), + b.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn scalar_mul( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + b: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::scalar_mul_ffi( + a.as_ptr(), + b.as_ptr(), + b.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + fn transpose( input: &(impl HostOrDeviceSlice<$field> + ?Sized), nof_rows: u32, @@ -394,6 +713,29 @@ macro_rules! impl_vec_ops_field { .wrap() } } + + fn slice( + input: &(impl HostOrDeviceSlice<$field> + ?Sized), + offset: u64, + stride: u64, + size_in: u64, + size_out: u64, + cfg: &VecOpsConfig, + output: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::slice_ffi( + input.as_ptr(), + offset, + stride, + size_in, + size_out, + cfg as *const VecOpsConfig, + output.as_mut_ptr(), + ) + .wrap() + } + } } }; } @@ -436,6 +778,12 @@ macro_rules!
impl_vec_ops_tests { initialize(); check_bit_reverse_inplace::<$field>() } + + #[test] + pub fn test_slice() { + initialize(); + check_slice::<$field>() + } } }; } diff --git a/wrappers/rust/icicle-core/src/vec_ops/tests.rs b/wrappers/rust/icicle-core/src/vec_ops/tests.rs index 6762f06c9..0dbd4c9a3 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/tests.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/tests.rs @@ -2,8 +2,9 @@ use crate::test_utilities; use crate::traits::GenerateRandom; use crate::vec_ops::{ - accumulate_scalars, add_scalars, bit_reverse, bit_reverse_inplace, mul_scalars, sub_scalars, transpose_matrix, - FieldImpl, VecOps, VecOpsConfig, + accumulate_scalars, add_scalars, bit_reverse, bit_reverse_inplace, div_scalars, mul_scalars, product_scalars, + scalar_add, scalar_mul, scalar_sub, slice, sub_scalars, sum_scalars, transpose_matrix, FieldImpl, VecOps, + VecOpsConfig, }; use icicle_runtime::device::Device; use icicle_runtime::memory::{DeviceVec, HostSlice}; @@ -44,6 +45,12 @@ where check_vec_ops_scalars_add::(test_size); check_vec_ops_scalars_sub::(test_size); check_vec_ops_scalars_mul::(test_size); + check_vec_ops_scalars_div::(test_size); + check_vec_ops_scalars_sum::(test_size); + check_vec_ops_scalars_product::(test_size); + check_vec_ops_scalars_add_scalar::(test_size); + check_vec_ops_scalars_sub_scalar::(test_size); + check_vec_ops_scalars_mul_scalar::(test_size); check_vec_ops_scalars_accumulate::(test_size); } @@ -140,6 +147,191 @@ where .unwrap(); } +pub fn check_vec_ops_scalars_div(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(test_size); + let b = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let b = HostSlice::from_slice(&b); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + + test_utilities::test_set_main_device(); + div_scalars(a_main, b, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + div_scalars(a_main, b, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_sum(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + + test_utilities::test_set_main_device(); + sum_scalars(a_main, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + sum_scalars(a_main, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_product(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); 
test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + + test_utilities::test_set_main_device(); + product_scalars(a_main, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + product_scalars(a_main, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_add_scalar(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(1); + let b = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let b = HostSlice::from_slice(&b); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + cfg.batch_size = 1; + + test_utilities::test_set_main_device(); + scalar_add(a_main, b, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + scalar_add(a_main, b, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_sub_scalar(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(1); + let b = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let b = HostSlice::from_slice(&b); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + cfg.batch_size = 1; + + test_utilities::test_set_main_device(); + scalar_sub(a_main, b, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + scalar_sub(a_main, b, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_mul_scalar(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(1); + let b = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let b = HostSlice::from_slice(&b); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + cfg.batch_size = 1; + + test_utilities::test_set_main_device(); + scalar_mul(a_main, b, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + scalar_mul(a_main, b, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + pub fn 
check_vec_ops_scalars_accumulate(test_size: usize) where ::Config: VecOps + GenerateRandom, @@ -205,6 +397,47 @@ where assert_eq!(result_main, result_ref); } +pub fn check_slice() +where + ::Config: VecOps + GenerateRandom, +{ + let size_in: u64 = 1 << 10; + let offset: u64 = 10; + let stride: u64 = 3; + let size_out: u64 = ((size_in - offset) / stride) - 1; + + let input_matrix = F::Config::generate_random(size_in as usize); + let mut result_main = vec![F::zero(); size_out as usize]; + let mut result_ref = vec![F::zero(); size_out as usize]; + + let cfg = VecOpsConfig::default(); + test_utilities::test_set_main_device(); + slice( + HostSlice::from_slice(&input_matrix), + offset, + stride, + size_in, + size_out, + &cfg, + HostSlice::from_mut_slice(&mut result_main), + ) + .unwrap(); + + test_utilities::test_set_ref_device(); + slice( + HostSlice::from_slice(&input_matrix), + offset, + stride, + size_in, + size_out, + &cfg, + HostSlice::from_mut_slice(&mut result_ref), + ) + .unwrap(); + + assert_eq!(result_main, result_ref); +} + pub fn check_bit_reverse() where ::Config: VecOps + GenerateRandom,