From f651c59d106fb60708448b7a5fef1c3eebbf56fa Mon Sep 17 00:00:00 2001 From: Stas Polonsky Date: Tue, 10 Sep 2024 21:38:06 +0000 Subject: [PATCH 01/43] initial edits --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 6 ++--- icicle/include/icicle/vec_ops.h | 9 +++++++ icicle/src/vec_ops.cpp | 25 ++++++++++++++++++++ 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 3a2156d60..e3f7532aa 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -362,8 +362,7 @@ eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, co return eIcicleError::SUCCESS; } -// Once backend will support - uncomment the following line -// REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); +REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); /*********************************** SUM ***********************************/ template eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) @@ -387,8 +386,7 @@ eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n return eIcicleError::SUCCESS; } -// Once backend will support - uncomment the following line -// REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); +REGISTER_VECTOR_PRODUCT_BACKEND("CPU", cpu_vector_product); /*********************************** MUL BY SCALAR***********************************/ template diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index 735aaf65c..f29ccd335 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -46,6 +46,15 @@ namespace icicle { return config; } + // Reduction operations + + template + eIcicleError vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output); + + template + eIcicleError vector_sum(const Device& device, const T* 
vec_a, uint64_t n, const VecOpsConfig& config, T* output); + + // Element-wise vector operations /** diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index d42fa0dca..ad44767a5 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -3,6 +3,31 @@ namespace icicle { + + /*********************************** REDUCE PRODUCT ************************/ + ICICLE_DISPATCHER_INST(VectorProductDispatcher, vector_product, scalarVectorOpImpl /* @@@ confirm this argument */); + + // TODO: extern "C" for FFI + + template <> + eIcicleError + vector_product(const scalar_t* vec_a, uint64_t n, const VecOpsConfig& config, scalar_t* output) + { + return CONCAT_EXPAND(FIELD, vector_product)(vec_a, n, &config, output); + } + + /*********************************** REDUCE SUM ****************************/ + ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, scalarVectorOpImpl /* @@@ confirm this argument */); + + // TODO: extern "C" for FFI + + template <> + eIcicleError + vector_sum(const scalar_t* vec_a, uint64_t n, const VecOpsConfig& config, scalar_t* output) + { + return CONCAT_EXPAND(FIELD, vector_sum)(vec_a, n, &config, output); + } + /*********************************** ADD ***********************************/ ICICLE_DISPATCHER_INST(VectorAddDispatcher, vector_add, scalarVectorOpImpl); From 64d4414c0899def332fdce7775bb9da6b77f5312 Mon Sep 17 00:00:00 2001 From: Stas Polonsky Date: Fri, 13 Sep 2024 21:14:56 +0000 Subject: [PATCH 02/43] vector_sum issue --- .../c++/vector-api/.devcontainer/Dockerfile | 25 ++++ .../.devcontainer/devcontainer.json | 22 +++ examples/c++/vector-api/CMakeLists.txt | 16 +++ examples/c++/vector-api/README.md | 32 +++++ examples/c++/vector-api/example.cpp | 136 ++++++++++++++++++ examples/c++/vector-api/run.sh | 66 +++++++++ icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 44 +++++- icicle/include/icicle/api/babybear.h | 48 +++---- icicle/include/icicle/api/bls12_377.h | 42 +++--- icicle/include/icicle/api/bls12_381.h | 42 +++--- 
icicle/include/icicle/api/bn254.h | 42 +++--- icicle/include/icicle/api/bw6_761.h | 42 +++--- icicle/include/icicle/api/grumpkin.h | 24 ++-- icicle/include/icicle/api/stark252.h | 10 +- .../include/icicle/backend/vec_ops_backend.h | 31 ++++ icicle/include/icicle/vec_ops.h | 26 +++- icicle/src/vec_ops.cpp | 16 ++- 17 files changed, 531 insertions(+), 133 deletions(-) create mode 100644 examples/c++/vector-api/.devcontainer/Dockerfile create mode 100644 examples/c++/vector-api/.devcontainer/devcontainer.json create mode 100644 examples/c++/vector-api/CMakeLists.txt create mode 100644 examples/c++/vector-api/README.md create mode 100644 examples/c++/vector-api/example.cpp create mode 100755 examples/c++/vector-api/run.sh diff --git a/examples/c++/vector-api/.devcontainer/Dockerfile b/examples/c++/vector-api/.devcontainer/Dockerfile new file mode 100644 index 000000000..64188da96 --- /dev/null +++ b/examples/c++/vector-api/.devcontainer/Dockerfile @@ -0,0 +1,25 @@ +# Make sure NVIDIA Container Toolkit is installed on your host + +# Use the specified base image +FROM nvidia/cuda:12.0.0-devel-ubuntu22.04 + +# Update and install dependencies +RUN apt-get update && apt-get install -y \ + cmake \ + curl \ + build-essential \ + git \ + libboost-all-dev \ + && rm -rf /var/lib/apt/lists/* + +# Clone Icicle from a GitHub repository +RUN git clone https://github.com/ingonyama-zk/icicle.git /icicle + +# Set the working directory in the container +WORKDIR /icicle-example + +# Specify the default command for the container +CMD ["/bin/bash"] + + + diff --git a/examples/c++/vector-api/.devcontainer/devcontainer.json b/examples/c++/vector-api/.devcontainer/devcontainer.json new file mode 100644 index 000000000..490fe90a6 --- /dev/null +++ b/examples/c++/vector-api/.devcontainer/devcontainer.json @@ -0,0 +1,22 @@ +{ + "name": "Icicle Examples: polynomial multiplication", + "build": { + "dockerfile": "Dockerfile" + }, + "runArgs": [ + "--gpus", + "all" + ], + "postCreateCommand": [ + 
"nvidia-smi" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-vscode.cmake-tools", + "ms-python.python", + "ms-vscode.cpptools" + ] + } + } +} \ No newline at end of file diff --git a/examples/c++/vector-api/CMakeLists.txt b/examples/c++/vector-api/CMakeLists.txt new file mode 100644 index 000000000..c32f17f43 --- /dev/null +++ b/examples/c++/vector-api/CMakeLists.txt @@ -0,0 +1,16 @@ +cmake_minimum_required(VERSION 3.18) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED TRUE) + +project(example) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") +add_executable(example example.cpp) +target_include_directories(example PRIVATE "../../../icicle/include" "..") +target_link_directories(example PRIVATE "${CMAKE_SOURCE_DIR}/build/icicle") +message("${CMAKE_BINARY_DIR}/icicle") +target_link_libraries(example PRIVATE icicle_curve_bn254 icicle_field_bn254 icicle_device) +if(BACKEND_DIR) + add_compile_definitions(BACKEND_DIR="${BACKEND_DIR}") +endif() + diff --git a/examples/c++/vector-api/README.md b/examples/c++/vector-api/README.md new file mode 100644 index 000000000..46c556339 --- /dev/null +++ b/examples/c++/vector-api/README.md @@ -0,0 +1,32 @@ +# Icicle Example: Vector Operations API + +TBD + +## Key-Takeaway + +Icicle provides polynomial multiplication using the Number Theoretical Transform (NTT), including forward and inverse transforms. + +## Concise Usage Explanation + +1. Include the necessary headers. +2. Initialize the NTT domain. +3. Prepare and transform the polynomials from host to device memory. +4. Perform pointwise multiplication. +5. Apply the inverse NTT. + +## Running the example + +```sh +# for CPU +./run.sh -d CPU +# for CUDA +./run.sh -d CUDA -b /path/to/cuda/backend/install/dir +``` + +## What's in the example + +1. Define the size of the example. +2. Initialize input polynomials. +3. Perform Radix-2 or Mixed-Radix NTT. +4. Perform pointwise polynomial multiplication. +5. Apply the inverse NTT. 
diff --git a/examples/c++/vector-api/example.cpp b/examples/c++/vector-api/example.cpp new file mode 100644 index 000000000..ca653abe9 --- /dev/null +++ b/examples/c++/vector-api/example.cpp @@ -0,0 +1,136 @@ +#include +#include +#include + +#include "icicle/runtime.h" +#include "icicle/api/bn254.h" +#include "icicle/utils/log.h" + + +// SP: I undertstand this code is auto-generated, but I can't get scrip/gen to work. + +extern "C" eIcicleError bn254_vector_product( + const bn254::scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result); + +extern "C" eIcicleError bn254_vector_sum( + const bn254::scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result); + +// SP: end of my changes + +using namespace bn254; + +#include "examples_utils.h" + +void random_samples(scalar_t* res, uint32_t count) +{ + for (int i = 0; i < count; i++) + res[i] = i < 1000 ? scalar_t::rand_host() : res[i - 1000]; +} + +// void incremental_values(scalar_t* res, uint32_t count) +// { +// for (int i = 0; i < count; i++) { +// res[i] = i ? 
res[i - 1] + scalar_t::one() : scalar_t::zero(); +// } +// } + +int main(int argc, char** argv) +{ + try_load_and_set_backend_device(argc, argv); + + int N_LOG = 20; + int N = 1 << N_LOG; + + // on-host data + auto h_a = std::make_unique(N); + auto h_b = std::make_unique(N); + auto h_out = std::make_unique(N); + + random_samples(h_a.get(), N ); + random_samples(h_b.get(), N ); + + // on-device data + scalar_t *d_a, *d_b, *d_out; + + DeviceProperties device_props; + ICICLE_CHECK(icicle_get_device_properties(device_props)); + + ICICLE_CHECK(icicle_malloc((void**)&d_a, sizeof(scalar_t) * N)); + ICICLE_CHECK(icicle_malloc((void**)&d_b, sizeof(scalar_t) * N)); + ICICLE_CHECK(icicle_malloc((void**)&d_out, sizeof(scalar_t) * N)); + + ICICLE_CHECK(icicle_copy(d_a, h_a.get(), sizeof(scalar_t) * N)); + ICICLE_CHECK(icicle_copy(d_b, h_b.get(), sizeof(scalar_t) * N)); + + VecOpsConfig h_config{ + nullptr, + false, // is_a_on_device + false, // is_b_on_device + false, // is_result_on_device + false, // is_async + nullptr // ext + }; + + VecOpsConfig d_config{ + nullptr, + true, // is_a_on_device + true, // is_b_on_device + true, // is_result_on_device + false, // is_async + nullptr // ext + }; + + + // Reduction operations + + START_TIMER(baseline_reduce_sum); + h_out[0] = scalar_t::zero(); + for (uint64_t i = 0; i < N; ++i) { + h_out[0] = h_out[0] + h_a[i]; + } + END_TIMER(baseline_reduce_sum, "baseline reduce sum took"); + + ICICLE_LOG_INFO << "Failed to load "; + std::cout << "ext: " << std::endl; + // d_config.ext = 2; + std::cout << "ext: " << d_config.ext << std::endl; + + // return 0; + + START_TIMER(reduce_sum); + ICICLE_CHECK(bn254_vector_sum(d_a, N, &d_config, d_out)); + END_TIMER(reduce_sum, "reduce sum took"); + + + std::cout << "h_out: " << h_out[0] << std::endl; + std::cout << "d_out: " << d_out[0] << std::endl; + + + + + START_TIMER(baseline_reduce_product); + h_out[0] = scalar_t::one(); + for (uint64_t i = 0; i < N; ++i) { + h_out[0] = h_out[0] * h_a[i]; + } + 
END_TIMER(baseline_reduce_product, "baseline reduce product took"); + + + START_TIMER(reduce_product); + ICICLE_CHECK(bn254_vector_product(d_a, N, &d_config, d_out)); + END_TIMER(reduce_product, "reduce product took"); + + + std::cout << "h_out: " << h_out[0] << std::endl; + std::cout << "d_out: " << d_out[0] << std::endl; + + + + + + ICICLE_CHECK(icicle_free(d_a)); + ICICLE_CHECK(icicle_free(d_b)); + ICICLE_CHECK(icicle_free(d_out)); + + return 0; +} \ No newline at end of file diff --git a/examples/c++/vector-api/run.sh b/examples/c++/vector-api/run.sh new file mode 100755 index 000000000..879390d0a --- /dev/null +++ b/examples/c++/vector-api/run.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# Exit immediately if a command exits with a non-zero status +set -e + +# Function to display usage information +show_help() { + echo "Usage: $0 [-d DEVICE_TYPE] [-b ICICLE_BACKEND_INSTALL_DIR]" + echo + echo "Options:" + echo " -d DEVICE_TYPE Specify the device type (default: CPU)" + echo " -b ICICLE_BACKEND_INSTALL_DIR Specify the backend installation directory (default: empty)" + echo " -h Show this help message" + exit 0 +} + +# Parse command line options +while getopts ":d:b:h" opt; do + case ${opt} in + d ) + DEVICE_TYPE=$OPTARG + ;; + b ) + ICICLE_BACKEND_INSTALL_DIR="$(realpath ${OPTARG})" + ;; + h ) + show_help + ;; + \? ) + echo "Invalid option: -$OPTARG" 1>&2 + show_help + ;; + : ) + echo "Invalid option: -$OPTARG requires an argument" 1>&2 + show_help + ;; + esac +done + +# Set default values if not provided +: "${DEVICE_TYPE:=CPU}" +: "${ICICLE_BACKEND_INSTALL_DIR:=}" + +# Create necessary directories +mkdir -p build/example +mkdir -p build/icicle + +ICILE_DIR=$(realpath "../../../icicle/") +ICICLE_CUDA_SOURCE_DIR="${ICILE_DIR}/backend/cuda" + +# Build Icicle and the example app that links to it +if [ "$DEVICE_TYPE" == "CUDA" ] && [ ! 
-d "${ICICLE_BACKEND_INSTALL_DIR}" ] && [ -d "${ICICLE_CUDA_SOURCE_DIR}" ]; then + echo "Building icicle with CUDA backend" + cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DMSM=OFF -DG2=OFF -DECNTT=OFF -DCUDA_BACKEND=local -S "${ICILE_DIR}" -B build/icicle + export ICICLE_BACKEND_INSTALL_DIR=$(realpath "build/icicle/backend") +else + echo "Building icicle without CUDA backend, ICICLE_BACKEND_INSTALL_DIR=${ICICLE_BACKEND_INSTALL_DIR}" + export ICICLE_BACKEND_INSTALL_DIR="${ICICLE_BACKEND_INSTALL_DIR}" + cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -S "${ICILE_DIR}" -B build/icicle +fi +cmake -DCMAKE_BUILD_TYPE=Release -S . -B build/example + +cmake --build build/icicle -j +cmake --build build/example -j + +./build/example/example "$DEVICE_TYPE" diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index e3f7532aa..2b0114611 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -154,7 +154,9 @@ class VectorOpTask : public TaskBase // Single worker functionality to execute sum(vector) void vector_sum() { + ICICLE_LOG_INFO << "enter vector_sum"; *m_output = m_op_a[0]; + ICICLE_LOG_INFO << "point 1"; for (uint64_t i = 1; i < m_nof_operations; ++i) { *m_output = *m_output + m_op_a[i]; } @@ -242,6 +244,7 @@ class VectorOpTask : public TaskBase int m_bit_size; // use in bitrev operation uint64_t m_stride; // used in slice operation T* m_output; // pointer to the output. Can be a vector or scalar pointer +public: T m_intermidiate_res; // pointer to the output. 
Can be a vector or scalar pointer }; @@ -339,6 +342,11 @@ cpu_vector_div(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, REGISTER_VECTOR_DIV_BACKEND("CPU", cpu_vector_div); + +// #define SP_DEBUG + +#ifndef SP_DEBUG + /*********************************** SUM ***********************************/ template eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) @@ -362,8 +370,27 @@ eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, co return eIcicleError::SUCCESS; } +#else + +template +eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) +{ + *output = scalar_t::zero(); + for (uint64_t i = 0; i < n; ++i) { + *output = *output + vec_a[i]; + } + return eIcicleError::SUCCESS; +} + +#endif + + REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); -/*********************************** SUM ***********************************/ +/*********************************** PRODUCT ***********************************/ + + +#ifndef SP_DEBUG + template eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) { @@ -379,13 +406,26 @@ eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n } if (vec_s_offset < n) { task_p->send_intermidiate_res_task( - VecOperation::VECTOR_SUM, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_s_offset), vec_a + vec_s_offset); + VecOperation::VECTOR_PRODUCT, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_s_offset), vec_a + vec_s_offset); vec_s_offset += NOF_OPERATIONS_PER_TASK; } } while (task_p != nullptr); return eIcicleError::SUCCESS; } +#else +template +eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) +{ + *output = scalar_t::one(); + for (uint64_t i = 0; i < n; ++i) { + *output = *output * vec_a[i]; + } + return 
eIcicleError::SUCCESS; +} + +#endif + REGISTER_VECTOR_PRODUCT_BACKEND("CPU", cpu_vector_product); /*********************************** MUL BY SCALAR***********************************/ diff --git a/icicle/include/icicle/api/babybear.h b/icicle/include/icicle/api/babybear.h index c0104443e..0e329f4d1 100644 --- a/icicle/include/icicle/api/babybear.h +++ b/icicle/include/icicle/api/babybear.h @@ -9,6 +9,11 @@ #include "icicle/ntt.h" #include "icicle/vec_ops.h" +extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size); + +extern "C" void babybear_scalar_convert_montgomery( + const babybear::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, babybear::scalar_t* output); + extern "C" eIcicleError babybear_ntt_init_domain( babybear::scalar_t* primitive_root, const NTTInitDomainConfig* config); @@ -17,25 +22,14 @@ extern "C" eIcicleError babybear_ntt( extern "C" eIcicleError babybear_ntt_release_domain(); -extern "C" eIcicleError babybear_vector_mul( - const babybear::scalar_t* vec_a, const babybear::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* result); - -extern "C" eIcicleError babybear_vector_add( - const babybear::scalar_t* vec_a, const babybear::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* result); - -extern "C" eIcicleError babybear_vector_sub( - const babybear::scalar_t* vec_a, const babybear::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* result); +extern "C" eIcicleError babybear_extension_ntt( + const babybear::extension_t* input, int size, NTTDir dir, const NTTConfig* config, babybear::extension_t* output); -extern "C" eIcicleError babybear_matrix_transpose( - const babybear::scalar_t* input, - uint32_t nof_rows, - uint32_t nof_cols, - const VecOpsConfig* config, - babybear::scalar_t* output); -extern "C" eIcicleError babybear_bit_reverse( - const babybear::scalar_t* input, uint64_t n, const VecOpsConfig* config, 
babybear::scalar_t* output); +extern "C" void babybear_extension_generate_scalars(babybear::extension_t* scalars, int size); +extern "C" eIcicleError babybear_extension_scalar_convert_montgomery( + const babybear::extension_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, babybear::extension_t* output); extern "C" eIcicleError babybear_extension_vector_mul( const babybear::extension_t* vec_a, const babybear::extension_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::extension_t* result); @@ -57,17 +51,23 @@ extern "C" eIcicleError babybear_extension_bit_reverse( const babybear::extension_t* input, uint64_t n, const VecOpsConfig* config, babybear::extension_t* output); -extern "C" void babybear_extension_generate_scalars(babybear::extension_t* scalars, int size); +extern "C" eIcicleError babybear_vector_mul( + const babybear::scalar_t* vec_a, const babybear::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* result); -extern "C" eIcicleError babybear_extension_scalar_convert_montgomery( - const babybear::extension_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, babybear::extension_t* output); +extern "C" eIcicleError babybear_vector_add( + const babybear::scalar_t* vec_a, const babybear::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* result); -extern "C" eIcicleError babybear_extension_ntt( - const babybear::extension_t* input, int size, NTTDir dir, const NTTConfig* config, babybear::extension_t* output); +extern "C" eIcicleError babybear_vector_sub( + const babybear::scalar_t* vec_a, const babybear::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* result); +extern "C" eIcicleError babybear_matrix_transpose( + const babybear::scalar_t* input, + uint32_t nof_rows, + uint32_t nof_cols, + const VecOpsConfig* config, + babybear::scalar_t* output); -extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size); +extern "C" 
eIcicleError babybear_bit_reverse( + const babybear::scalar_t* input, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* output); -extern "C" void babybear_scalar_convert_montgomery( - const babybear::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, babybear::scalar_t* output); diff --git a/icicle/include/icicle/api/bls12_377.h b/icicle/include/icicle/api/bls12_377.h index 8287a5102..c617dcaf9 100644 --- a/icicle/include/icicle/api/bls12_377.h +++ b/icicle/include/icicle/api/bls12_377.h @@ -10,19 +10,19 @@ #include "icicle/msm.h" #include "icicle/vec_ops.h" -extern "C" bool bls12_377_eq(bls12_377::projective_t* point1, bls12_377::projective_t* point2); +extern "C" bool bls12_377_g2_eq(bls12_377::g2_projective_t* point1, bls12_377::g2_projective_t* point2); -extern "C" void bls12_377_to_affine(bls12_377::projective_t* point, bls12_377::affine_t* point_out); +extern "C" void bls12_377_g2_to_affine(bls12_377::g2_projective_t* point, bls12_377::g2_affine_t* point_out); -extern "C" void bls12_377_generate_projective_points(bls12_377::projective_t* points, int size); +extern "C" void bls12_377_g2_generate_projective_points(bls12_377::g2_projective_t* points, int size); -extern "C" void bls12_377_generate_affine_points(bls12_377::affine_t* points, int size); +extern "C" void bls12_377_g2_generate_affine_points(bls12_377::g2_affine_t* points, int size); -extern "C" eIcicleError bls12_377_affine_convert_montgomery( - const bls12_377::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::affine_t* output); +extern "C" eIcicleError bls12_377_g2_affine_convert_montgomery( + const bls12_377::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::g2_affine_t* output); -extern "C" eIcicleError bls12_377_projective_convert_montgomery( - const bls12_377::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::projective_t* output); +extern "C" eIcicleError 
bls12_377_g2_projective_convert_montgomery( + const bls12_377::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::g2_projective_t* output); extern "C" eIcicleError bls12_377_ecntt( const bls12_377::projective_t* input, int size, NTTDir dir, const NTTConfig* config, bls12_377::projective_t* output); @@ -37,19 +37,19 @@ extern "C" eIcicleError bls12_377_precompute_msm_bases( extern "C" eIcicleError bls12_377_msm( const bls12_377::scalar_t* scalars, const bls12_377::affine_t* points, int msm_size, const MSMConfig* config, bls12_377::projective_t* out); -extern "C" bool bls12_377_g2_eq(bls12_377::g2_projective_t* point1, bls12_377::g2_projective_t* point2); +extern "C" bool bls12_377_eq(bls12_377::projective_t* point1, bls12_377::projective_t* point2); -extern "C" void bls12_377_g2_to_affine(bls12_377::g2_projective_t* point, bls12_377::g2_affine_t* point_out); +extern "C" void bls12_377_to_affine(bls12_377::projective_t* point, bls12_377::affine_t* point_out); -extern "C" void bls12_377_g2_generate_projective_points(bls12_377::g2_projective_t* points, int size); +extern "C" void bls12_377_generate_projective_points(bls12_377::projective_t* points, int size); -extern "C" void bls12_377_g2_generate_affine_points(bls12_377::g2_affine_t* points, int size); +extern "C" void bls12_377_generate_affine_points(bls12_377::affine_t* points, int size); -extern "C" eIcicleError bls12_377_g2_affine_convert_montgomery( - const bls12_377::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::g2_affine_t* output); +extern "C" eIcicleError bls12_377_affine_convert_montgomery( + const bls12_377::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::affine_t* output); -extern "C" eIcicleError bls12_377_g2_projective_convert_montgomery( - const bls12_377::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::g2_projective_t* output); +extern "C" eIcicleError 
bls12_377_projective_convert_montgomery( + const bls12_377::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::projective_t* output); extern "C" eIcicleError bls12_377_g2_precompute_msm_bases( const bls12_377::g2_affine_t* bases, @@ -60,6 +60,11 @@ extern "C" eIcicleError bls12_377_g2_precompute_msm_bases( extern "C" eIcicleError bls12_377_g2_msm( const bls12_377::scalar_t* scalars, const bls12_377::g2_affine_t* points, int msm_size, const MSMConfig* config, bls12_377::g2_projective_t* out); +extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size); + +extern "C" void bls12_377_scalar_convert_montgomery( + const bls12_377::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bls12_377::scalar_t* output); + extern "C" eIcicleError bls12_377_ntt_init_domain( bls12_377::scalar_t* primitive_root, const NTTInitDomainConfig* config); @@ -88,8 +93,3 @@ extern "C" eIcicleError bls12_377_bit_reverse( const bls12_377::scalar_t* input, uint64_t n, const VecOpsConfig* config, bls12_377::scalar_t* output); -extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size); - -extern "C" void bls12_377_scalar_convert_montgomery( - const bls12_377::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bls12_377::scalar_t* output); - diff --git a/icicle/include/icicle/api/bls12_381.h b/icicle/include/icicle/api/bls12_381.h index d2b7d6999..361731586 100644 --- a/icicle/include/icicle/api/bls12_381.h +++ b/icicle/include/icicle/api/bls12_381.h @@ -10,19 +10,19 @@ #include "icicle/msm.h" #include "icicle/vec_ops.h" -extern "C" bool bls12_381_eq(bls12_381::projective_t* point1, bls12_381::projective_t* point2); +extern "C" bool bls12_381_g2_eq(bls12_381::g2_projective_t* point1, bls12_381::g2_projective_t* point2); -extern "C" void bls12_381_to_affine(bls12_381::projective_t* point, bls12_381::affine_t* point_out); +extern "C" void 
bls12_381_g2_to_affine(bls12_381::g2_projective_t* point, bls12_381::g2_affine_t* point_out); -extern "C" void bls12_381_generate_projective_points(bls12_381::projective_t* points, int size); +extern "C" void bls12_381_g2_generate_projective_points(bls12_381::g2_projective_t* points, int size); -extern "C" void bls12_381_generate_affine_points(bls12_381::affine_t* points, int size); +extern "C" void bls12_381_g2_generate_affine_points(bls12_381::g2_affine_t* points, int size); -extern "C" eIcicleError bls12_381_affine_convert_montgomery( - const bls12_381::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::affine_t* output); +extern "C" eIcicleError bls12_381_g2_affine_convert_montgomery( + const bls12_381::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::g2_affine_t* output); -extern "C" eIcicleError bls12_381_projective_convert_montgomery( - const bls12_381::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::projective_t* output); +extern "C" eIcicleError bls12_381_g2_projective_convert_montgomery( + const bls12_381::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::g2_projective_t* output); extern "C" eIcicleError bls12_381_ecntt( const bls12_381::projective_t* input, int size, NTTDir dir, const NTTConfig* config, bls12_381::projective_t* output); @@ -37,19 +37,19 @@ extern "C" eIcicleError bls12_381_precompute_msm_bases( extern "C" eIcicleError bls12_381_msm( const bls12_381::scalar_t* scalars, const bls12_381::affine_t* points, int msm_size, const MSMConfig* config, bls12_381::projective_t* out); -extern "C" bool bls12_381_g2_eq(bls12_381::g2_projective_t* point1, bls12_381::g2_projective_t* point2); +extern "C" bool bls12_381_eq(bls12_381::projective_t* point1, bls12_381::projective_t* point2); -extern "C" void bls12_381_g2_to_affine(bls12_381::g2_projective_t* point, bls12_381::g2_affine_t* point_out); +extern "C" void 
bls12_381_to_affine(bls12_381::projective_t* point, bls12_381::affine_t* point_out); -extern "C" void bls12_381_g2_generate_projective_points(bls12_381::g2_projective_t* points, int size); +extern "C" void bls12_381_generate_projective_points(bls12_381::projective_t* points, int size); -extern "C" void bls12_381_g2_generate_affine_points(bls12_381::g2_affine_t* points, int size); +extern "C" void bls12_381_generate_affine_points(bls12_381::affine_t* points, int size); -extern "C" eIcicleError bls12_381_g2_affine_convert_montgomery( - const bls12_381::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::g2_affine_t* output); +extern "C" eIcicleError bls12_381_affine_convert_montgomery( + const bls12_381::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::affine_t* output); -extern "C" eIcicleError bls12_381_g2_projective_convert_montgomery( - const bls12_381::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::g2_projective_t* output); +extern "C" eIcicleError bls12_381_projective_convert_montgomery( + const bls12_381::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::projective_t* output); extern "C" eIcicleError bls12_381_g2_precompute_msm_bases( const bls12_381::g2_affine_t* bases, @@ -60,6 +60,11 @@ extern "C" eIcicleError bls12_381_g2_precompute_msm_bases( extern "C" eIcicleError bls12_381_g2_msm( const bls12_381::scalar_t* scalars, const bls12_381::g2_affine_t* points, int msm_size, const MSMConfig* config, bls12_381::g2_projective_t* out); +extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size); + +extern "C" void bls12_381_scalar_convert_montgomery( + const bls12_381::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bls12_381::scalar_t* output); + extern "C" eIcicleError bls12_381_ntt_init_domain( bls12_381::scalar_t* primitive_root, const NTTInitDomainConfig* config); @@ -88,8 
+93,3 @@ extern "C" eIcicleError bls12_381_bit_reverse( const bls12_381::scalar_t* input, uint64_t n, const VecOpsConfig* config, bls12_381::scalar_t* output); -extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size); - -extern "C" void bls12_381_scalar_convert_montgomery( - const bls12_381::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bls12_381::scalar_t* output); - diff --git a/icicle/include/icicle/api/bn254.h b/icicle/include/icicle/api/bn254.h index d054f23b4..928cb639e 100644 --- a/icicle/include/icicle/api/bn254.h +++ b/icicle/include/icicle/api/bn254.h @@ -10,19 +10,19 @@ #include "icicle/msm.h" #include "icicle/vec_ops.h" -extern "C" bool bn254_eq(bn254::projective_t* point1, bn254::projective_t* point2); +extern "C" bool bn254_g2_eq(bn254::g2_projective_t* point1, bn254::g2_projective_t* point2); -extern "C" void bn254_to_affine(bn254::projective_t* point, bn254::affine_t* point_out); +extern "C" void bn254_g2_to_affine(bn254::g2_projective_t* point, bn254::g2_affine_t* point_out); -extern "C" void bn254_generate_projective_points(bn254::projective_t* points, int size); +extern "C" void bn254_g2_generate_projective_points(bn254::g2_projective_t* points, int size); -extern "C" void bn254_generate_affine_points(bn254::affine_t* points, int size); +extern "C" void bn254_g2_generate_affine_points(bn254::g2_affine_t* points, int size); -extern "C" eIcicleError bn254_affine_convert_montgomery( - const bn254::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::affine_t* output); +extern "C" eIcicleError bn254_g2_affine_convert_montgomery( + const bn254::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::g2_affine_t* output); -extern "C" eIcicleError bn254_projective_convert_montgomery( - const bn254::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::projective_t* output); +extern "C" eIcicleError 
bn254_g2_projective_convert_montgomery( + const bn254::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::g2_projective_t* output); extern "C" eIcicleError bn254_ecntt( const bn254::projective_t* input, int size, NTTDir dir, const NTTConfig* config, bn254::projective_t* output); @@ -37,19 +37,19 @@ extern "C" eIcicleError bn254_precompute_msm_bases( extern "C" eIcicleError bn254_msm( const bn254::scalar_t* scalars, const bn254::affine_t* points, int msm_size, const MSMConfig* config, bn254::projective_t* out); -extern "C" bool bn254_g2_eq(bn254::g2_projective_t* point1, bn254::g2_projective_t* point2); +extern "C" bool bn254_eq(bn254::projective_t* point1, bn254::projective_t* point2); -extern "C" void bn254_g2_to_affine(bn254::g2_projective_t* point, bn254::g2_affine_t* point_out); +extern "C" void bn254_to_affine(bn254::projective_t* point, bn254::affine_t* point_out); -extern "C" void bn254_g2_generate_projective_points(bn254::g2_projective_t* points, int size); +extern "C" void bn254_generate_projective_points(bn254::projective_t* points, int size); -extern "C" void bn254_g2_generate_affine_points(bn254::g2_affine_t* points, int size); +extern "C" void bn254_generate_affine_points(bn254::affine_t* points, int size); -extern "C" eIcicleError bn254_g2_affine_convert_montgomery( - const bn254::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::g2_affine_t* output); +extern "C" eIcicleError bn254_affine_convert_montgomery( + const bn254::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::affine_t* output); -extern "C" eIcicleError bn254_g2_projective_convert_montgomery( - const bn254::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::g2_projective_t* output); +extern "C" eIcicleError bn254_projective_convert_montgomery( + const bn254::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::projective_t* output); extern "C" 
eIcicleError bn254_g2_precompute_msm_bases( const bn254::g2_affine_t* bases, @@ -60,6 +60,11 @@ extern "C" eIcicleError bn254_g2_precompute_msm_bases( extern "C" eIcicleError bn254_g2_msm( const bn254::scalar_t* scalars, const bn254::g2_affine_t* points, int msm_size, const MSMConfig* config, bn254::g2_projective_t* out); +extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size); + +extern "C" void bn254_scalar_convert_montgomery( + const bn254::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bn254::scalar_t* output); + extern "C" eIcicleError bn254_ntt_init_domain( bn254::scalar_t* primitive_root, const NTTInitDomainConfig* config); @@ -88,8 +93,3 @@ extern "C" eIcicleError bn254_bit_reverse( const bn254::scalar_t* input, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* output); -extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size); - -extern "C" void bn254_scalar_convert_montgomery( - const bn254::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bn254::scalar_t* output); - diff --git a/icicle/include/icicle/api/bw6_761.h b/icicle/include/icicle/api/bw6_761.h index 31d3b87e2..6b48606a2 100644 --- a/icicle/include/icicle/api/bw6_761.h +++ b/icicle/include/icicle/api/bw6_761.h @@ -10,19 +10,19 @@ #include "icicle/msm.h" #include "icicle/vec_ops.h" -extern "C" bool bw6_761_eq(bw6_761::projective_t* point1, bw6_761::projective_t* point2); +extern "C" bool bw6_761_g2_eq(bw6_761::g2_projective_t* point1, bw6_761::g2_projective_t* point2); -extern "C" void bw6_761_to_affine(bw6_761::projective_t* point, bw6_761::affine_t* point_out); +extern "C" void bw6_761_g2_to_affine(bw6_761::g2_projective_t* point, bw6_761::g2_affine_t* point_out); -extern "C" void bw6_761_generate_projective_points(bw6_761::projective_t* points, int size); +extern "C" void bw6_761_g2_generate_projective_points(bw6_761::g2_projective_t* points, int size); -extern "C" void 
bw6_761_generate_affine_points(bw6_761::affine_t* points, int size); +extern "C" void bw6_761_g2_generate_affine_points(bw6_761::g2_affine_t* points, int size); -extern "C" eIcicleError bw6_761_affine_convert_montgomery( - const bw6_761::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::affine_t* output); +extern "C" eIcicleError bw6_761_g2_affine_convert_montgomery( + const bw6_761::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::g2_affine_t* output); -extern "C" eIcicleError bw6_761_projective_convert_montgomery( - const bw6_761::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::projective_t* output); +extern "C" eIcicleError bw6_761_g2_projective_convert_montgomery( + const bw6_761::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::g2_projective_t* output); extern "C" eIcicleError bw6_761_ecntt( const bw6_761::projective_t* input, int size, NTTDir dir, const NTTConfig* config, bw6_761::projective_t* output); @@ -37,19 +37,19 @@ extern "C" eIcicleError bw6_761_precompute_msm_bases( extern "C" eIcicleError bw6_761_msm( const bw6_761::scalar_t* scalars, const bw6_761::affine_t* points, int msm_size, const MSMConfig* config, bw6_761::projective_t* out); -extern "C" bool bw6_761_g2_eq(bw6_761::g2_projective_t* point1, bw6_761::g2_projective_t* point2); +extern "C" bool bw6_761_eq(bw6_761::projective_t* point1, bw6_761::projective_t* point2); -extern "C" void bw6_761_g2_to_affine(bw6_761::g2_projective_t* point, bw6_761::g2_affine_t* point_out); +extern "C" void bw6_761_to_affine(bw6_761::projective_t* point, bw6_761::affine_t* point_out); -extern "C" void bw6_761_g2_generate_projective_points(bw6_761::g2_projective_t* points, int size); +extern "C" void bw6_761_generate_projective_points(bw6_761::projective_t* points, int size); -extern "C" void bw6_761_g2_generate_affine_points(bw6_761::g2_affine_t* points, int size); +extern "C" void 
bw6_761_generate_affine_points(bw6_761::affine_t* points, int size); -extern "C" eIcicleError bw6_761_g2_affine_convert_montgomery( - const bw6_761::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::g2_affine_t* output); +extern "C" eIcicleError bw6_761_affine_convert_montgomery( + const bw6_761::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::affine_t* output); -extern "C" eIcicleError bw6_761_g2_projective_convert_montgomery( - const bw6_761::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::g2_projective_t* output); +extern "C" eIcicleError bw6_761_projective_convert_montgomery( + const bw6_761::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::projective_t* output); extern "C" eIcicleError bw6_761_g2_precompute_msm_bases( const bw6_761::g2_affine_t* bases, @@ -60,6 +60,11 @@ extern "C" eIcicleError bw6_761_g2_precompute_msm_bases( extern "C" eIcicleError bw6_761_g2_msm( const bw6_761::scalar_t* scalars, const bw6_761::g2_affine_t* points, int msm_size, const MSMConfig* config, bw6_761::g2_projective_t* out); +extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size); + +extern "C" void bw6_761_scalar_convert_montgomery( + const bw6_761::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bw6_761::scalar_t* output); + extern "C" eIcicleError bw6_761_ntt_init_domain( bw6_761::scalar_t* primitive_root, const NTTInitDomainConfig* config); @@ -88,8 +93,3 @@ extern "C" eIcicleError bw6_761_bit_reverse( const bw6_761::scalar_t* input, uint64_t n, const VecOpsConfig* config, bw6_761::scalar_t* output); -extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size); - -extern "C" void bw6_761_scalar_convert_montgomery( - const bw6_761::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bw6_761::scalar_t* output); - diff --git 
a/icicle/include/icicle/api/grumpkin.h b/icicle/include/icicle/api/grumpkin.h index 9908e492b..42b1b2195 100644 --- a/icicle/include/icicle/api/grumpkin.h +++ b/icicle/include/icicle/api/grumpkin.h @@ -9,6 +9,15 @@ #include "icicle/msm.h" #include "icicle/vec_ops.h" +extern "C" eIcicleError grumpkin_precompute_msm_bases( + const grumpkin::affine_t* bases, + int nof_bases, + const MSMConfig* config, + grumpkin::affine_t* output_bases); + +extern "C" eIcicleError grumpkin_msm( + const grumpkin::scalar_t* scalars, const grumpkin::affine_t* points, int msm_size, const MSMConfig* config, grumpkin::projective_t* out); + extern "C" bool grumpkin_eq(grumpkin::projective_t* point1, grumpkin::projective_t* point2); extern "C" void grumpkin_to_affine(grumpkin::projective_t* point, grumpkin::affine_t* point_out); @@ -23,14 +32,10 @@ extern "C" eIcicleError grumpkin_affine_convert_montgomery( extern "C" eIcicleError grumpkin_projective_convert_montgomery( const grumpkin::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, grumpkin::projective_t* output); -extern "C" eIcicleError grumpkin_precompute_msm_bases( - const grumpkin::affine_t* bases, - int nof_bases, - const MSMConfig* config, - grumpkin::affine_t* output_bases); +extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size); -extern "C" eIcicleError grumpkin_msm( - const grumpkin::scalar_t* scalars, const grumpkin::affine_t* points, int msm_size, const MSMConfig* config, grumpkin::projective_t* out); +extern "C" void grumpkin_scalar_convert_montgomery( + const grumpkin::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, grumpkin::scalar_t* output); extern "C" eIcicleError grumpkin_vector_mul( const grumpkin::scalar_t* vec_a, const grumpkin::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, grumpkin::scalar_t* result); @@ -52,8 +57,3 @@ extern "C" eIcicleError grumpkin_bit_reverse( const grumpkin::scalar_t* input, uint64_t n, const VecOpsConfig* 
config, grumpkin::scalar_t* output); -extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size); - -extern "C" void grumpkin_scalar_convert_montgomery( - const grumpkin::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, grumpkin::scalar_t* output); - diff --git a/icicle/include/icicle/api/stark252.h b/icicle/include/icicle/api/stark252.h index 3bbe9626f..6a8ff1a74 100644 --- a/icicle/include/icicle/api/stark252.h +++ b/icicle/include/icicle/api/stark252.h @@ -9,6 +9,11 @@ #include "icicle/ntt.h" #include "icicle/vec_ops.h" +extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size); + +extern "C" void stark252_scalar_convert_montgomery( + const stark252::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, stark252::scalar_t* output); + extern "C" eIcicleError stark252_ntt_init_domain( stark252::scalar_t* primitive_root, const NTTInitDomainConfig* config); @@ -37,8 +42,3 @@ extern "C" eIcicleError stark252_bit_reverse( const stark252::scalar_t* input, uint64_t n, const VecOpsConfig* config, stark252::scalar_t* output); -extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size); - -extern "C" void stark252_scalar_convert_montgomery( - const stark252::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, stark252::scalar_t* output); - diff --git a/icicle/include/icicle/backend/vec_ops_backend.h b/icicle/include/icicle/backend/vec_ops_backend.h index 74502a9a4..3914e750a 100644 --- a/icicle/include/icicle/backend/vec_ops_backend.h +++ b/icicle/include/icicle/backend/vec_ops_backend.h @@ -7,6 +7,15 @@ using namespace field_config; namespace icicle { /*************************** Backend registration ***************************/ + using scalarVectorReduceOpImpl = std::function; + + + using scalarVectorOpImpl = std::function; + void register_vector_sum(const std::string& deviceType, scalarVectorReduceOpImpl impl); + +#define 
REGISTER_VECTOR_SUM_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_sum) = []() -> bool { \ + register_vector_sum(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_vector_product(const std::string& deviceType, scalarVectorReduceOpImpl impl); + +#define REGISTER_VECTOR_PRODUCT_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_product) = []() -> bool { \ + register_vector_product(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + + void register_vector_add(const std::string& deviceType, scalarVectorOpImpl impl); #define REGISTER_VECTOR_ADD_BACKEND(DEVICE_TYPE, FUNC) \ diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index f29ccd335..322ed0c81 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -48,11 +48,33 @@ namespace icicle { // Reduction operations + /** + * @brief Computes the product of all elements in a vector. + * + * @tparam T Type of the elements in the vector. + * @param vec_a Input vector. + * @param n Number of elements in the vector. + * @param config Configuration for the operation. + * @param output Output scalar to store the result. + * @return eIcicleError Error code indicating success or failure. + */ + template - eIcicleError vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output); + eIcicleError vector_product(const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output); + + /** + * @brief Computes the sum of all elements in a vector. + * + * @tparam T Type of the elements in the vector. + * @param vec_a Input vector. + * @param n Number of elements in the vector. + * @param config Configuration for the operation. + * @param output Output scalar to store the result. + * @return eIcicleError Error code indicating success or failure. 
+ */ template - eIcicleError vector_sum(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output); + eIcicleError vector_sum(const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output); // Element-wise vector operations diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index ad44767a5..e0acd0091 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -5,9 +5,13 @@ namespace icicle { /*********************************** REDUCE PRODUCT ************************/ - ICICLE_DISPATCHER_INST(VectorProductDispatcher, vector_product, scalarVectorOpImpl /* @@@ confirm this argument */); + ICICLE_DISPATCHER_INST(VectorProductDispatcher, vector_product, scalarVectorReduceOpImpl); - // TODO: extern "C" for FFI + extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_product)( + const scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, scalar_t* output) + { + return VectorProductDispatcher::execute(vec_a, n, *config, output); + } template <> eIcicleError @@ -17,9 +21,13 @@ namespace icicle { } /*********************************** REDUCE SUM ****************************/ - ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, scalarVectorOpImpl /* @@@ confirm this argument */); + ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, scalarVectorReduceOpImpl ); - // TODO: extern "C" for FFI + extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_sum)( + const scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, scalar_t* output) + { + return VectorSumDispatcher::execute(vec_a, n, *config, output); + } template <> eIcicleError From 04351fbfc217ad6edb2c20583fc77f4c0a6343ed Mon Sep 17 00:00:00 2001 From: Stas Polonsky Date: Mon, 16 Sep 2024 13:44:23 +0000 Subject: [PATCH 03/43] for Miki --- examples/c++/vector-api/example.cpp | 11 ++-------- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 22 ++++++++++++++++++-- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/examples/c++/vector-api/example.cpp 
b/examples/c++/vector-api/example.cpp index ca653abe9..953ddbb84 100644 --- a/examples/c++/vector-api/example.cpp +++ b/examples/c++/vector-api/example.cpp @@ -38,7 +38,7 @@ int main(int argc, char** argv) { try_load_and_set_backend_device(argc, argv); - int N_LOG = 20; + int N_LOG = 10; int N = 1 << N_LOG; // on-host data @@ -90,13 +90,6 @@ int main(int argc, char** argv) } END_TIMER(baseline_reduce_sum, "baseline reduce sum took"); - ICICLE_LOG_INFO << "Failed to load "; - std::cout << "ext: " << std::endl; - // d_config.ext = 2; - std::cout << "ext: " << d_config.ext << std::endl; - - // return 0; - START_TIMER(reduce_sum); ICICLE_CHECK(bn254_vector_sum(d_a, N, &d_config, d_out)); END_TIMER(reduce_sum, "reduce sum took"); @@ -106,7 +99,7 @@ int main(int argc, char** argv) std::cout << "d_out: " << d_out[0] << std::endl; - + return 0; START_TIMER(baseline_reduce_product); h_out[0] = scalar_t::one(); diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 2b0114611..939604e45 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -71,6 +71,7 @@ class VectorOpTask : public TaskBase m_operation = operation; m_nof_operations = nof_operations; m_op_a = op_a; + // SP: where is m_output? 
dispatch(); } @@ -155,10 +156,14 @@ class VectorOpTask : public TaskBase void vector_sum() { ICICLE_LOG_INFO << "enter vector_sum"; - *m_output = m_op_a[0]; + ICICLE_LOG_INFO << "m_op_a[0]: " << m_op_a[0]; + ICICLE_LOG_INFO << "point 0"; + // *m_output = m_op_a[0]; + m_intermidiate_res = m_op_a[0]; ICICLE_LOG_INFO << "point 1"; for (uint64_t i = 1; i < m_nof_operations; ++i) { - *m_output = *m_output + m_op_a[i]; + // *m_output = *m_output + m_op_a[i]; + m_intermidiate_res = m_intermidiate_res + m_op_a[i]; } } // Single worker functionality to execute product(vector) @@ -351,6 +356,7 @@ REGISTER_VECTOR_DIV_BACKEND("CPU", cpu_vector_div); template eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) { + ICICLE_LOG_INFO << "cpu_vector_sum"; TasksManager> task_manager(get_nof_workers(config)); bool output_initialized = false; uint64_t vec_s_offset = 0; @@ -359,14 +365,24 @@ eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, co do { task_p = vec_s_offset < n ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); if (task_p->is_completed()) { + ICICLE_LOG_INFO << "task_p->m_intermidiate_res: " << task_p->m_intermidiate_res; *output = output_initialized ? task_p->m_intermidiate_res : *output + task_p->m_intermidiate_res; + // SP: we used m_intermidiate_res, we have to mark it so we can't use it again. set_idle? + // SP: Use dispatch if setting a new task, or set_idle if to just mark the task result as handled. 
+ // output_initialized = true; + // task_p->set_idle(); + ICICLE_LOG_INFO << "after set_idle"; + ICICLE_LOG_INFO << "is_completed: " << task_p->is_completed(); } if (vec_s_offset < n) { + ICICLE_LOG_INFO << "vec_s_offset: " << vec_s_offset; task_p->send_intermidiate_res_task( VecOperation::VECTOR_SUM, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_s_offset), vec_a + vec_s_offset); vec_s_offset += NOF_OPERATIONS_PER_TASK; } + ICICLE_LOG_INFO << "task_p: " << task_p; } while (task_p != nullptr); + // } while (vec_s_offset < n); return eIcicleError::SUCCESS; } @@ -394,6 +410,7 @@ REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); template eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) { + ICICLE_LOG_INFO << "cpu_vector_product"; TasksManager> task_manager(get_nof_workers(config)); bool output_initialized = false; uint64_t vec_s_offset = 0; @@ -405,6 +422,7 @@ eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n *output = output_initialized ? 
task_p->m_intermidiate_res : *output * task_p->m_intermidiate_res; } if (vec_s_offset < n) { + ICICLE_LOG_INFO << "vec_s_offset: " << vec_s_offset; task_p->send_intermidiate_res_task( VecOperation::VECTOR_PRODUCT, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_s_offset), vec_a + vec_s_offset); vec_s_offset += NOF_OPERATIONS_PER_TASK; From f3086d4f38b8e0569faa833450acf671a3809e35 Mon Sep 17 00:00:00 2001 From: Stas Polonsky Date: Mon, 16 Sep 2024 18:40:24 +0000 Subject: [PATCH 04/43] debugged reduction ops --- examples/c++/vector-api/README.md | 26 ++--- examples/c++/vector-api/example.cpp | 34 ++++-- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 113 ++++++------------- 3 files changed, 72 insertions(+), 101 deletions(-) diff --git a/examples/c++/vector-api/README.md b/examples/c++/vector-api/README.md index 46c556339..120156c9f 100644 --- a/examples/c++/vector-api/README.md +++ b/examples/c++/vector-api/README.md @@ -1,18 +1,16 @@ # Icicle Example: Vector Operations API -TBD - ## Key-Takeaway -Icicle provides polynomial multiplication using the Number Theoretical Transform (NTT), including forward and inverse transforms. - -## Concise Usage Explanation +The Vector Operations API supports the following: -1. Include the necessary headers. -2. Initialize the NTT domain. -3. Prepare and transform the polynomials from host to device memory. -4. Perform pointwise multiplication. -5. Apply the inverse NTT. + - element-wise vector operations (e.g. addition, multiplication) + - vector reduction operations (e.g. sum of elements, product of elements) + - scalar-vector operations (e.g add scalar to vector) + - matrix operations (e.g. transposition) + - miscellaneous operations like bit-reversal and slicing. + + All these operations can be performed on a host or device both synchronously and asynchronously. ## Running the example @@ -25,8 +23,6 @@ Icicle provides polynomial multiplication using the Number Theoretical Transform ## What's in the example -1. 
Define the size of the example. -2. Initialize input polynomials. -3. Perform Radix-2 or Mixed-Radix NTT. -4. Perform pointwise polynomial multiplication. -5. Apply the inverse NTT. +1. `example_element_wise`: examples of element-wise operations +2. `example_scalar_vector`: examples of scalar-vector operations + diff --git a/examples/c++/vector-api/example.cpp b/examples/c++/vector-api/example.cpp index 953ddbb84..10ee787cb 100644 --- a/examples/c++/vector-api/example.cpp +++ b/examples/c++/vector-api/example.cpp @@ -27,18 +27,23 @@ void random_samples(scalar_t* res, uint32_t count) res[i] = i < 1000 ? scalar_t::rand_host() : res[i - 1000]; } -// void incremental_values(scalar_t* res, uint32_t count) -// { -// for (int i = 0; i < count; i++) { -// res[i] = i ? res[i - 1] + scalar_t::one() : scalar_t::zero(); -// } -// } +void incremental_values(scalar_t* res, uint32_t count) +{ + for (int i = 0; i < count; i++) { + res[i] = i ? res[i - 1] + scalar_t::one() : scalar_t::zero(); + } +} + + +void example_element_wise() { + return; +} int main(int argc, char** argv) { - try_load_and_set_backend_device(argc, argv); + // try_load_and_set_backend_device(argc, argv); - int N_LOG = 10; + int N_LOG = 20; int N = 1 << N_LOG; // on-host data @@ -49,11 +54,19 @@ int main(int argc, char** argv) random_samples(h_a.get(), N ); random_samples(h_b.get(), N ); + // incremental_values(h_a.get(), N ); + // incremental_values(h_b.get(), N ); + // on-device data scalar_t *d_a, *d_b, *d_out; DeviceProperties device_props; ICICLE_CHECK(icicle_get_device_properties(device_props)); + if (!device_props.using_host_memory) { + std::cout << "Device isn't using host memory" << std::endl; + } else { + std::cout << "Device is using host memory" << std::endl; + } ICICLE_CHECK(icicle_malloc((void**)&d_a, sizeof(scalar_t) * N)); ICICLE_CHECK(icicle_malloc((void**)&d_b, sizeof(scalar_t) * N)); @@ -91,7 +104,8 @@ int main(int argc, char** argv) END_TIMER(baseline_reduce_sum, "baseline reduce sum took"); 
START_TIMER(reduce_sum); - ICICLE_CHECK(bn254_vector_sum(d_a, N, &d_config, d_out)); + ICICLE_CHECK(bn254_vector_sum(d_a, N, &h_config, d_out)); + // ICICLE_CHECK(bn254_vector_sum(d_a, N, &d_config, d_out)); END_TIMER(reduce_sum, "reduce sum took"); @@ -99,7 +113,7 @@ int main(int argc, char** argv) std::cout << "d_out: " << d_out[0] << std::endl; - return 0; + // return 0; START_TIMER(baseline_reduce_product); h_out[0] = scalar_t::one(); diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 939604e45..a06c12a50 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -155,23 +155,21 @@ class VectorOpTask : public TaskBase // Single worker functionality to execute sum(vector) void vector_sum() { - ICICLE_LOG_INFO << "enter vector_sum"; - ICICLE_LOG_INFO << "m_op_a[0]: " << m_op_a[0]; - ICICLE_LOG_INFO << "point 0"; - // *m_output = m_op_a[0]; + // SP: *m_output = m_op_a[0]; m_intermidiate_res = m_op_a[0]; - ICICLE_LOG_INFO << "point 1"; for (uint64_t i = 1; i < m_nof_operations; ++i) { - // *m_output = *m_output + m_op_a[i]; + // SP: *m_output = *m_output + m_op_a[i]; m_intermidiate_res = m_intermidiate_res + m_op_a[i]; } } // Single worker functionality to execute product(vector) void vector_product() { - *m_output = m_op_a[0]; + // SP: *m_output = m_op_a[0]; + m_intermidiate_res = m_op_a[0]; for (uint64_t i = 1; i < m_nof_operations; ++i) { - *m_output = *m_output * m_op_a[i]; + // SP: *m_output = *m_output * m_op_a[i]; + m_intermidiate_res = m_intermidiate_res * m_op_a[i]; } } // Single worker functionality to execute conversion from barret to montgomery @@ -347,103 +345,66 @@ cpu_vector_div(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, REGISTER_VECTOR_DIV_BACKEND("CPU", cpu_vector_div); - -// #define SP_DEBUG - -#ifndef SP_DEBUG - /*********************************** SUM ***********************************/ template eIcicleError 
cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) { - ICICLE_LOG_INFO << "cpu_vector_sum"; TasksManager> task_manager(get_nof_workers(config)); bool output_initialized = false; - uint64_t vec_s_offset = 0; - VectorOpTask* task_p; + uint64_t vec_a_offset = 0; // run until all vector deployed and all tasks completed - do { - task_p = vec_s_offset < n ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + while (true) { + VectorOpTask* task_p = vec_a_offset < n ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + if (task_p == nullptr) { + return eIcicleError::SUCCESS; + } if (task_p->is_completed()) { - ICICLE_LOG_INFO << "task_p->m_intermidiate_res: " << task_p->m_intermidiate_res; - *output = output_initialized ? task_p->m_intermidiate_res : *output + task_p->m_intermidiate_res; - // SP: we used m_intermidiate_res, we have to mark it so we can't use it again. set_idle? - // SP: Use dispatch if setting a new task, or set_idle if to just mark the task result as handled. - // output_initialized = true; - // task_p->set_idle(); - ICICLE_LOG_INFO << "after set_idle"; - ICICLE_LOG_INFO << "is_completed: " << task_p->is_completed(); + *output = output_initialized ? 
*output + task_p->m_intermidiate_res : task_p->m_intermidiate_res; + output_initialized = true; } - if (vec_s_offset < n) { - ICICLE_LOG_INFO << "vec_s_offset: " << vec_s_offset; + if (vec_a_offset < n) { task_p->send_intermidiate_res_task( - VecOperation::VECTOR_SUM, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_s_offset), vec_a + vec_s_offset); - vec_s_offset += NOF_OPERATIONS_PER_TASK; + VecOperation::VECTOR_SUM, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_a_offset), vec_a + vec_a_offset); + vec_a_offset += NOF_OPERATIONS_PER_TASK; + } + else { + task_p->set_idle(); } - ICICLE_LOG_INFO << "task_p: " << task_p; - } while (task_p != nullptr); - // } while (vec_s_offset < n); - return eIcicleError::SUCCESS; -} - -#else - -template -eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) -{ - *output = scalar_t::zero(); - for (uint64_t i = 0; i < n; ++i) { - *output = *output + vec_a[i]; } - return eIcicleError::SUCCESS; } -#endif - - REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); -/*********************************** PRODUCT ***********************************/ - - -#ifndef SP_DEBUG +/*********************************** PRODUCT ***********************************/ template eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) { ICICLE_LOG_INFO << "cpu_vector_product"; TasksManager> task_manager(get_nof_workers(config)); bool output_initialized = false; - uint64_t vec_s_offset = 0; - VectorOpTask* task_p; + uint64_t vec_a_offset = 0; + // run until all vector deployed and all tasks completed - do { - task_p = vec_s_offset < n ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + while (true) { + VectorOpTask* task_p = vec_a_offset < n ? 
task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + if (task_p == nullptr) { + return eIcicleError::SUCCESS; + } if (task_p->is_completed()) { - *output = output_initialized ? task_p->m_intermidiate_res : *output * task_p->m_intermidiate_res; + *output = output_initialized ? *output * task_p->m_intermidiate_res : task_p->m_intermidiate_res; + output_initialized = true; } - if (vec_s_offset < n) { - ICICLE_LOG_INFO << "vec_s_offset: " << vec_s_offset; + if (vec_a_offset < n) { task_p->send_intermidiate_res_task( - VecOperation::VECTOR_PRODUCT, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_s_offset), vec_a + vec_s_offset); - vec_s_offset += NOF_OPERATIONS_PER_TASK; + VecOperation::VECTOR_PRODUCT, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_a_offset), vec_a + vec_a_offset); + vec_a_offset += NOF_OPERATIONS_PER_TASK; } - } while (task_p != nullptr); - return eIcicleError::SUCCESS; -} - -#else -template -eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) -{ - *output = scalar_t::one(); - for (uint64_t i = 0; i < n; ++i) { - *output = *output * vec_a[i]; - } - return eIcicleError::SUCCESS; + else { + task_p->set_idle(); + } + } } -#endif - REGISTER_VECTOR_PRODUCT_BACKEND("CPU", cpu_vector_product); /*********************************** MUL BY SCALAR***********************************/ From 2ab44886e67154aa711b31cdbe433b1362d0483e Mon Sep 17 00:00:00 2001 From: Stas Polonsky Date: Mon, 16 Sep 2024 20:29:22 +0000 Subject: [PATCH 05/43] added offset/stride to reduce ops --- examples/c++/vector-api/example.cpp | 8 ++++---- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 5 +++-- icicle/include/icicle/backend/vec_ops_backend.h | 4 +++- icicle/include/icicle/vec_ops.h | 4 ++-- icicle/src/vec_ops.cpp | 16 ++++++++-------- 5 files changed, 20 insertions(+), 17 deletions(-) diff --git a/examples/c++/vector-api/example.cpp b/examples/c++/vector-api/example.cpp index 
10ee787cb..16637d7e1 100644 --- a/examples/c++/vector-api/example.cpp +++ b/examples/c++/vector-api/example.cpp @@ -10,10 +10,10 @@ // SP: I undertstand this code is auto-generated, but I can't get scrip/gen to work. extern "C" eIcicleError bn254_vector_product( - const bn254::scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result); + const bn254::scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result, uint64_t offset, uint64_t stride); extern "C" eIcicleError bn254_vector_sum( - const bn254::scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result); + const bn254::scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result, uint64_t offset, uint64_t stride); // SP: end of my changes @@ -104,7 +104,7 @@ int main(int argc, char** argv) END_TIMER(baseline_reduce_sum, "baseline reduce sum took"); START_TIMER(reduce_sum); - ICICLE_CHECK(bn254_vector_sum(d_a, N, &h_config, d_out)); + ICICLE_CHECK(bn254_vector_sum(d_a, N, &h_config, d_out, 0, 1)); // ICICLE_CHECK(bn254_vector_sum(d_a, N, &d_config, d_out)); END_TIMER(reduce_sum, "reduce sum took"); @@ -124,7 +124,7 @@ int main(int argc, char** argv) START_TIMER(reduce_product); - ICICLE_CHECK(bn254_vector_product(d_a, N, &d_config, d_out)); + ICICLE_CHECK(bn254_vector_product(d_a, N, &d_config, d_out, 0, 1)); END_TIMER(reduce_product, "reduce product took"); diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index a06c12a50..3dba93937 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -346,8 +346,9 @@ cpu_vector_div(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, REGISTER_VECTOR_DIV_BACKEND("CPU", cpu_vector_div); /*********************************** SUM ***********************************/ + template -eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, const 
VecOpsConfig& config, T* output) +eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output, uint64_t offset, uint64_t stride) { TasksManager> task_manager(get_nof_workers(config)); bool output_initialized = false; @@ -377,7 +378,7 @@ REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); /*********************************** PRODUCT ***********************************/ template -eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output, uint64_t offset, uint64_t stride) { ICICLE_LOG_INFO << "cpu_vector_product"; TasksManager> task_manager(get_nof_workers(config)); diff --git a/icicle/include/icicle/backend/vec_ops_backend.h b/icicle/include/icicle/backend/vec_ops_backend.h index 3914e750a..3ce9271e7 100644 --- a/icicle/include/icicle/backend/vec_ops_backend.h +++ b/icicle/include/icicle/backend/vec_ops_backend.h @@ -12,7 +12,9 @@ namespace icicle { const scalar_t* vec_a, uint64_t n, const VecOpsConfig& config, - scalar_t* output)>; + scalar_t* output, + uint64_t offset, + uint64_t stride)>; diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index 322ed0c81..e0cf6f7af 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -60,7 +60,7 @@ namespace icicle { */ template - eIcicleError vector_product(const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output); + eIcicleError vector_product(const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output, uint64_t offset, uint64_t stride); /** * @brief Computes the sum of all elements in a vector. 
@@ -74,7 +74,7 @@ namespace icicle { */ template - eIcicleError vector_sum(const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output); + eIcicleError vector_sum(const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output, uint64_t offset, uint64_t stride); // Element-wise vector operations diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index e0acd0091..29ab25ba0 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -8,32 +8,32 @@ namespace icicle { ICICLE_DISPATCHER_INST(VectorProductDispatcher, vector_product, scalarVectorReduceOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_product)( - const scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, scalar_t* output, uint64_t offset, uint64_t stride) { - return VectorProductDispatcher::execute(vec_a, n, *config, output); + return VectorProductDispatcher::execute(vec_a, n, *config, output, offset, stride); } template <> eIcicleError - vector_product(const scalar_t* vec_a, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_product(const scalar_t* vec_a, uint64_t n, const VecOpsConfig& config, scalar_t* output, uint64_t offset, uint64_t stride) { - return CONCAT_EXPAND(FIELD, vector_product)(vec_a, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_product)(vec_a, n, &config, output, offset, stride); } /*********************************** REDUCE SUM ****************************/ ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, scalarVectorReduceOpImpl ); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_sum)( - const scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, scalar_t* output, uint64_t offset=0, uint64_t stride=1) { - return VectorSumDispatcher::execute(vec_a, n, *config, output); + return VectorSumDispatcher::execute(vec_a, n, *config, output, offset, 
stride); } template <> eIcicleError - vector_sum(const scalar_t* vec_a, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_sum(const scalar_t* vec_a, uint64_t n, const VecOpsConfig& config, scalar_t* output, uint64_t offset, uint64_t stride) { - return CONCAT_EXPAND(FIELD, vector_sum)(vec_a, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_sum)(vec_a, n, &config, output, offset, stride); } /*********************************** ADD ***********************************/ From 89e998ae2c7426e1843a6a9cf3f79f961c6cd4a6 Mon Sep 17 00:00:00 2001 From: Stas Polonsky Date: Tue, 17 Sep 2024 21:42:42 +0000 Subject: [PATCH 06/43] implemented strides ops --- examples/c++/vector-api/example.cpp | 15 ++++---- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 36 ++++++++++---------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/examples/c++/vector-api/example.cpp b/examples/c++/vector-api/example.cpp index 16637d7e1..2a998c5c7 100644 --- a/examples/c++/vector-api/example.cpp +++ b/examples/c++/vector-api/example.cpp @@ -30,7 +30,7 @@ void random_samples(scalar_t* res, uint32_t count) void incremental_values(scalar_t* res, uint32_t count) { for (int i = 0; i < count; i++) { - res[i] = i ? res[i - 1] + scalar_t::one() : scalar_t::zero(); + res[i] = i ? 
res[i - 1] + scalar_t::one() : scalar_t::one(); } } @@ -45,6 +45,8 @@ int main(int argc, char** argv) int N_LOG = 20; int N = 1 << N_LOG; + int offset = 1; + int stride = 4; // on-host data auto h_a = std::make_unique(N); @@ -98,14 +100,13 @@ int main(int argc, char** argv) START_TIMER(baseline_reduce_sum); h_out[0] = scalar_t::zero(); - for (uint64_t i = 0; i < N; ++i) { + for (uint64_t i = offset; i < N; i=i+stride) { h_out[0] = h_out[0] + h_a[i]; } END_TIMER(baseline_reduce_sum, "baseline reduce sum took"); START_TIMER(reduce_sum); - ICICLE_CHECK(bn254_vector_sum(d_a, N, &h_config, d_out, 0, 1)); - // ICICLE_CHECK(bn254_vector_sum(d_a, N, &d_config, d_out)); + ICICLE_CHECK(bn254_vector_sum(d_a, N, &h_config, d_out, offset, stride)); END_TIMER(reduce_sum, "reduce sum took"); @@ -113,18 +114,16 @@ int main(int argc, char** argv) std::cout << "d_out: " << d_out[0] << std::endl; - // return 0; - START_TIMER(baseline_reduce_product); h_out[0] = scalar_t::one(); - for (uint64_t i = 0; i < N; ++i) { + for (uint64_t i = offset; i < N; i = i + stride) { h_out[0] = h_out[0] * h_a[i]; } END_TIMER(baseline_reduce_product, "baseline reduce product took"); START_TIMER(reduce_product); - ICICLE_CHECK(bn254_vector_product(d_a, N, &d_config, d_out, 0, 1)); + ICICLE_CHECK(bn254_vector_product(d_a, N, &d_config, d_out, offset, stride)); END_TIMER(reduce_product, "reduce product took"); diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 3dba93937..48feb49ca 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -66,12 +66,12 @@ class VectorOpTask : public TaskBase dispatch(); } // Set the operands to execute a task of 1 operand and dispatch the task - void send_intermidiate_res_task(VecOperation operation, const int nof_operations, const T* op_a) + void send_intermidiate_res_task(VecOperation operation, const uint64_t stop_index, const T* op_a, const uint64_t stride) 
{ m_operation = operation; - m_nof_operations = nof_operations; + m_stop_index = stop_index; m_op_a = op_a; - // SP: where is m_output? + m_stride = stride; dispatch(); } @@ -155,20 +155,16 @@ class VectorOpTask : public TaskBase // Single worker functionality to execute sum(vector) void vector_sum() { - // SP: *m_output = m_op_a[0]; - m_intermidiate_res = m_op_a[0]; - for (uint64_t i = 1; i < m_nof_operations; ++i) { - // SP: *m_output = *m_output + m_op_a[i]; + m_intermidiate_res = T::zero(); + for (uint64_t i = 0; i < m_stop_index; i = i + m_stride) { m_intermidiate_res = m_intermidiate_res + m_op_a[i]; } } // Single worker functionality to execute product(vector) void vector_product() { - // SP: *m_output = m_op_a[0]; - m_intermidiate_res = m_op_a[0]; - for (uint64_t i = 1; i < m_nof_operations; ++i) { - // SP: *m_output = *m_output * m_op_a[i]; + m_intermidiate_res = T::one(); + for (uint64_t i = 0; i < m_stop_index; i = i + m_stride) { m_intermidiate_res = m_intermidiate_res * m_op_a[i]; } } @@ -244,6 +240,7 @@ class VectorOpTask : public TaskBase const T* m_op_a; // pointer to operand A. Operand A is a vector. const T* m_op_b; // pointer to operand B. Operand B is a vector or scalar uint64_t m_start_index; // index used in bitreverse + uint64_t m_stop_index; // index used in reduce operations int m_bit_size; // use in bitrev operation uint64_t m_stride; // used in slice operation T* m_output; // pointer to the output. Can be a vector or scalar pointer @@ -352,7 +349,9 @@ eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, co { TasksManager> task_manager(get_nof_workers(config)); bool output_initialized = false; - uint64_t vec_a_offset = 0; + uint64_t vec_a_offset = offset; + assert(stride > 0); + const uint64_t slice_length = stride * NOF_OPERATIONS_PER_TASK; // run until all vector deployed and all tasks completed while (true) { VectorOpTask* task_p = vec_a_offset < n ? 
task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); @@ -365,8 +364,8 @@ eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, co } if (vec_a_offset < n) { task_p->send_intermidiate_res_task( - VecOperation::VECTOR_SUM, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_a_offset), vec_a + vec_a_offset); - vec_a_offset += NOF_OPERATIONS_PER_TASK; + VecOperation::VECTOR_SUM, std::min( slice_length , n - vec_a_offset), vec_a + vec_a_offset, stride); + vec_a_offset += slice_length; } else { task_p->set_idle(); @@ -380,10 +379,11 @@ REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); template eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output, uint64_t offset, uint64_t stride) { - ICICLE_LOG_INFO << "cpu_vector_product"; TasksManager> task_manager(get_nof_workers(config)); bool output_initialized = false; - uint64_t vec_a_offset = 0; + uint64_t vec_a_offset = offset; + assert(stride > 0); + const uint64_t slice_length = stride * NOF_OPERATIONS_PER_TASK; // run until all vector deployed and all tasks completed while (true) { @@ -397,8 +397,8 @@ eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n } if (vec_a_offset < n) { task_p->send_intermidiate_res_task( - VecOperation::VECTOR_PRODUCT, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_a_offset), vec_a + vec_a_offset); - vec_a_offset += NOF_OPERATIONS_PER_TASK; + VecOperation::VECTOR_PRODUCT, std::min(slice_length, n - vec_a_offset), vec_a + vec_a_offset, stride); + vec_a_offset += slice_length; } else { task_p->set_idle(); From 9aaf944ff5286014a5eb747f4dc8f61de3d26b02 Mon Sep 17 00:00:00 2001 From: Shanie Winitz Date: Wed, 9 Oct 2024 09:14:50 +0300 Subject: [PATCH 07/43] vec_ops batch added --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 609 ++++++++++++------ .../include/icicle/backend/vec_ops_backend.h | 45 +- .../default_backend/default_poly_backend.h | 12 +- 
icicle/include/icicle/vec_ops.h | 328 ++++++---- icicle/src/vec_ops.cpp | 164 ++--- 5 files changed, 752 insertions(+), 406 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 48feb49ca..952f5108f 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -6,6 +6,9 @@ #include "icicle/fields/field_config.h" #include "tasks_manager.h" +#include +#include +#include using namespace field_config; using namespace icicle; @@ -18,14 +21,15 @@ enum VecOperation { VECTOR_MUL, VECTOR_DIV, VECTOR_SUM, + CONVERT_TO_MONTGOMERY, + CONVERT_FROM_MONTGOMERY, VECTOR_PRODUCT, SCALAR_ADD_VEC, SCALAR_SUB_VEC, SCALAR_MUL_VEC, - CONVERT_TO_MONTGOMERY, - CONVERT_FROM_MONTGOMERY, BIT_REVERSE, SLICE, + REPLACE_ELEMENTS, NOF_OPERATIONS }; @@ -46,18 +50,19 @@ class VectorOpTask : public TaskBase VectorOpTask() : TaskBase() {} // Set the operands to execute a task of 2 operands and 1 output and dispatch the task - void send_2ops_task(VecOperation operation, const int nof_operations, const T* op_a, const T* op_b, T* output) + void send_2ops_task(VecOperation operation, const uint32_t nof_operations, const T* op_a, const T* op_b, const uint32_t stride , T* output) { m_operation = operation; m_nof_operations = nof_operations; m_op_a = op_a; m_op_b = op_b; + m_stride = stride; m_output = output; dispatch(); } // Set the operands to execute a task of 1 operand and 1 output and dispatch the task - void send_1op_task(VecOperation operation, const int nof_operations, const T* op_a, T* output) + void send_1op_task(VecOperation operation, const uint32_t nof_operations, const T* op_a, T* output) { m_operation = operation; m_nof_operations = nof_operations; @@ -75,29 +80,48 @@ class VectorOpTask : public TaskBase dispatch(); } - // Set the operands to bitrev operation dispatch the task + // Set the operands for bitrev operation and dispatch the task void send_bitrev_task( - 
VecOperation operation, int bit_size, uint64_t start_index, const int nof_operations, const T* op_a, T* output) + VecOperation operation, uint32_t bit_size, uint64_t start_index, const uint32_t nof_operations, const T* op_a, const uint64_t stride, T* output) { m_operation = operation; + m_bit_size = bit_size; + m_start_index = start_index; m_nof_operations = nof_operations; m_op_a = op_a; + m_stride = stride; m_output = output; - m_bit_size = bit_size, m_start_index = start_index; dispatch(); } - // Set the operands to slice operation dispatch the task - void send_slice_task(VecOperation operation, uint64_t stride, const int nof_operations, const T* op_a, T* output) + // Set the operands for slice operation and dispatch the task + void send_slice_task(VecOperation operation, uint64_t stride, uint64_t stride_out, const uint32_t nof_operations, const T* op_a, T* output) { m_operation = operation; m_nof_operations = nof_operations; m_op_a = op_a; m_output = output; m_stride = stride; + m_stride_out = stride_out; + dispatch(); + } + + // Set the operands for replace_elements operation and dispatch the task + void send_replace_elements_task(VecOperation operation, const T* mat_in, const uint32_t nof_operations, std::vector& start_indices_in_mat, uint64_t start_index, uint32_t log_nof_rows, uint32_t log_nof_cols, const uint32_t stride, T* mat_out) + { + m_operation = operation; + m_op_a = mat_in; + m_nof_operations = nof_operations; + m_start_indices_in_mat = &start_indices_in_mat; + m_start_index = start_index; //start index in start_indices vector + m_log_nof_rows = log_nof_rows; + m_log_nof_cols = log_nof_cols; + m_stride = stride; + m_output = mat_out; dispatch(); } + // Execute the selected function based on m_operation virtual void execute() { (this->*functionPtrs[static_cast(m_operation)])(); } @@ -131,56 +155,55 @@ class VectorOpTask : public TaskBase m_output[i] = m_op_a[i] * T::inverse(m_op_b[i]); } } - // Single worker functionality to execute scalar + vector 
- void scalar_add_vec() - { - for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = *m_op_a + m_op_b[i]; - } - } - // Single worker functionality to execute scalar - vector - void scalar_sub_vec() + // Single worker functionality to execute conversion from barret to montgomery + void convert_to_montgomery() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = *m_op_a + m_op_b[i]; + m_output[i] = T::to_montgomery(m_op_a[i]); } } - // Single worker functionality to execute scalar * vector - void scalar_mul_vec() + // Single worker functionality to execute conversion from montgomery to barret + void convert_from_montgomery() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = *m_op_a * m_op_b[i]; + m_output[i] = T::from_montgomery(m_op_a[i]); } } // Single worker functionality to execute sum(vector) void vector_sum() { - m_intermidiate_res = T::zero(); - for (uint64_t i = 0; i < m_stop_index; i = i + m_stride) { - m_intermidiate_res = m_intermidiate_res + m_op_a[i]; + m_intermidiate_res[m_idx_in_batch] = T::zero(); + for (uint64_t i = 0; i < (m_stop_index * m_stride); i = i + m_stride) { + m_intermidiate_res[m_idx_in_batch] = m_intermidiate_res[m_idx_in_batch] + m_op_a[i]; } } // Single worker functionality to execute product(vector) void vector_product() { - m_intermidiate_res = T::one(); - for (uint64_t i = 0; i < m_stop_index; i = i + m_stride) { - m_intermidiate_res = m_intermidiate_res * m_op_a[i]; + m_intermidiate_res[m_idx_in_batch] = T::one(); + for (uint64_t i = 0; i < (m_stop_index * m_stride); i = i + m_stride) { + m_intermidiate_res[m_idx_in_batch] = m_intermidiate_res[m_idx_in_batch] * m_op_a[i]; } } - // Single worker functionality to execute conversion from barret to montgomery - void convert_to_montgomery() + // Single worker functionality to execute scalar + vector + void scalar_add_vec() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = T::to_montgomery(m_op_a[i]); + m_output[m_stride * i] 
= *m_op_a + m_op_b[m_stride * i]; } } - - // Single worker functionality to execute conversion from montgomery to barret - void convert_from_montgomery() + // Single worker functionality to execute scalar - vector + void scalar_sub_vec() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = T::from_montgomery(m_op_a[i]); + m_output[m_stride * i] = *m_op_a - m_op_b[m_stride * i]; + } + } + // Single worker functionality to execute scalar * vector + void scalar_mul_vec() + { + for (uint64_t i = 0; i < m_nof_operations; ++i) { + m_output[m_stride * i] = *m_op_a * m_op_b[m_stride * i]; } } // Single worker functionality to execute bit reverse reorder @@ -201,10 +224,10 @@ class VectorOpTask : public TaskBase if (m_output == m_op_a) { // inplace calculation if (rev_idx < idx) { // only on of the threads need to work - std::swap(m_output[idx], m_output[rev_idx]); + std::swap(m_output[m_stride*idx], m_output[m_stride*rev_idx]); } } else { // out of place calculation - m_output[idx] = m_op_a[rev_idx]; // set index value + m_output[m_stride*idx] = m_op_a[m_stride*rev_idx]; // set index value } } } @@ -213,10 +236,40 @@ class VectorOpTask : public TaskBase void slice() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = m_op_a[i * m_stride]; + m_output[i * m_stride_out] = m_op_a[i * m_stride]; } } + // Function to perform modulus with Mersenne number + uint64_t mersenne_mod(uint64_t shifted_idx, uint32_t total_bits) { + uint64_t mod = (1ULL << total_bits) - 1; + shifted_idx = (shifted_idx & mod) + (shifted_idx >> total_bits); + while (shifted_idx >= mod) { + shifted_idx = (shifted_idx & mod) + (shifted_idx >> total_bits); + } + // If shifted_idx == mod, result should be 0 since mod % mod == 0 + if (shifted_idx == mod) shifted_idx = 0; //TODO SHANIE - check if redundant + return shifted_idx; + } + + + // Single worker functionality to execute replace elements + void replace_elements() + { + const uint32_t total_bits = m_log_nof_rows + 
m_log_nof_cols; + for (uint32_t i = 0; i < m_nof_operations; ++i) { + uint64_t start_idx = (*m_start_indices_in_mat)[m_start_index + i]; + uint64_t idx = start_idx; + do { + uint64_t shifted_idx = idx << m_log_nof_rows; + uint64_t new_idx = mersenne_mod(shifted_idx, total_bits); + m_output[m_stride * new_idx] = m_op_a[m_stride * idx]; + idx = new_idx; + } while (idx != start_idx); + } + } + + // An array of available function pointers arranged according to the VecOperation enum using FunctionPtr = void (VectorOpTask::*)(); static constexpr std::array(NOF_OPERATIONS)> functionPtrs = { @@ -224,29 +277,36 @@ class VectorOpTask : public TaskBase &VectorOpTask::vector_sub, // VECTOR_SUB, &VectorOpTask::vector_mul, // VECTOR_MUL, &VectorOpTask::vector_div, // VECTOR_DIV, + &VectorOpTask::convert_to_montgomery, // CONVERT_TO_MONTGOMERY, + &VectorOpTask::convert_from_montgomery, // CONVERT_FROM_MONTGOMERY, &VectorOpTask::vector_sum, // VECTOR_SUM &VectorOpTask::vector_product, // VECTOR_PRODUCT &VectorOpTask::scalar_add_vec, // SCALAR_ADD_VEC, &VectorOpTask::scalar_sub_vec, // SCALAR_SUB_VEC, &VectorOpTask::scalar_mul_vec, // SCALAR_MUL_VEC, - &VectorOpTask::convert_to_montgomery, // CONVERT_TO_MONTGOMERY, - &VectorOpTask::convert_from_montgomery, // CONVERT_FROM_MONTGOMERY, &VectorOpTask::bit_reverse, // BIT_REVERSE - &VectorOpTask::slice // SLICE + &VectorOpTask::slice, // SLICE + &VectorOpTask::replace_elements // REPLACE_ELEMENTS }; VecOperation m_operation; // the operation to execute - int m_nof_operations; // number of operations to execute for this task - const T* m_op_a; // pointer to operand A. Operand A is a vector. + uint32_t m_nof_operations; // number of operations to execute for this task + const T* m_op_a; // pointer to operand A. Operand A is a vector, or metrix in case of replace_elements const T* m_op_b; // pointer to operand B. 
Operand B is a vector or scalar - uint64_t m_start_index; // index used in bitreverse + uint64_t m_start_index; // index used in bitreverse operation uint64_t m_stop_index; // index used in reduce operations - int m_bit_size; // use in bitrev operation - uint64_t m_stride; // used in slice operation - T* m_output; // pointer to the output. Can be a vector or scalar pointer + uint32_t m_bit_size; // use in bitrev operation + uint64_t m_stride; // used to support column batch operations + uint64_t m_stride_out; // used in slice operation + T* m_output; // pointer to the output. Can be a vector, scalar pointer, or a matrix pointer in case of replace_elements + uint32_t m_log_nof_rows; // log of the number of rows in the matrix, used in replace_elements + uint32_t m_log_nof_cols; // log of the number of columns in the matrix, used in replace_elements + const std::vector* m_start_indices_in_mat; // Indices used in replace_elements operations + public: - T m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer -}; + T* m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer + uint64_t m_idx_in_batch; // index in the batch. 
Used in intermidiate res tasks +}; // class VectorOpTask #define NOF_OPERATIONS_PER_TASK 512 #define CONFIG_NOF_THREADS_KEY "n_threads" @@ -263,12 +323,13 @@ int get_nof_workers(const VecOpsConfig& config) // Execute a full task from the type vector = vector (op) vector template eIcicleError -cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < n; i += NOF_OPERATIONS_PER_TASK) { + const uint64_t total_nof_operations = size*config.batch_size; + for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_2ops_task(op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - i), vec_a + i, vec_b + i, output + i); + task_p->send_2ops_task(op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), vec_a + i, vec_b + i, 1, output + i); } task_manager.wait_done(); return eIcicleError::SUCCESS; @@ -277,12 +338,22 @@ cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t n, con // Execute a full task from the type vector = scalar (op) vector template eIcicleError cpu_scalar_vector_op( - VecOperation op, const T* scalar_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) + VecOperation op, const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output) { TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < n; i += NOF_OPERATIONS_PER_TASK) { - VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_2ops_task(op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - i), scalar_a, vec_b + i, output + i); + const uint64_t total_nof_operations = use_single_scalar? 
size*config.batch_size : size; + const uint32_t stride = (!use_single_scalar && config.columns_batch)? config.batch_size : 1; + for (uint32_t idx_in_batch = 0; idx_in_batch < (use_single_scalar? 1 : config.batch_size); idx_in_batch++) { + for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_2ops_task( + op, + std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), + scalar_a + idx_in_batch, + (!use_single_scalar && config.columns_batch)? vec_b + idx_in_batch + i*config.batch_size : vec_b + idx_in_batch*size + i, + stride, + (!use_single_scalar && config.columns_batch)? output + idx_in_batch + i*config.batch_size : output + idx_in_batch*size + i); + } } task_manager.wait_done(); return eIcicleError::SUCCESS; @@ -290,11 +361,12 @@ eIcicleError cpu_scalar_vector_op( /////////////////////////////////////////////////////// // Functions to register at the CPU backend +/*********************************** ADD ***********************************/ template eIcicleError -cpu_vector_add(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +cpu_vector_add(const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_2vectors_op(VecOperation::VECTOR_ADD, vec_a, vec_b, n, config, output); + return cpu_2vectors_op(VecOperation::VECTOR_ADD, vec_a, vec_b, size, config, output); } REGISTER_VECTOR_ADD_BACKEND("CPU", cpu_vector_add); @@ -302,12 +374,9 @@ REGISTER_VECTOR_ADD_BACKEND("CPU", cpu_vector_add); /*********************************** ACCUMULATE ***********************************/ template eIcicleError -cpu_vector_accumulate(const Device& device, T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config) +cpu_vector_accumulate(const Device& device, T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config) { - for (uint64_t i = 
0; i < n; ++i) { - vec_a[i] = vec_a[i] + vec_b[i]; - } - return eIcicleError::SUCCESS; + return cpu_2vectors_op(VecOperation::VECTOR_ADD, vec_a, vec_b, size, config, vec_a); } REGISTER_VECTOR_ACCUMULATE_BACKEND("CPU", cpu_vector_accumulate); @@ -315,9 +384,9 @@ REGISTER_VECTOR_ACCUMULATE_BACKEND("CPU", cpu_vector_accumulate); /*********************************** SUB ***********************************/ template eIcicleError -cpu_vector_sub(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +cpu_vector_sub(const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_2vectors_op(VecOperation::VECTOR_SUB, vec_a, vec_b, n, config, output); + return cpu_2vectors_op(VecOperation::VECTOR_SUB, vec_a, vec_b, size, config, output); } REGISTER_VECTOR_SUB_BACKEND("CPU", cpu_vector_sub); @@ -325,9 +394,9 @@ REGISTER_VECTOR_SUB_BACKEND("CPU", cpu_vector_sub); /*********************************** MUL ***********************************/ template eIcicleError -cpu_vector_mul(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +cpu_vector_mul(const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_2vectors_op(VecOperation::VECTOR_MUL, vec_a, vec_b, n, config, output); + return cpu_2vectors_op(VecOperation::VECTOR_MUL, vec_a, vec_b, size, config, output); } REGISTER_VECTOR_MUL_BACKEND("CPU", cpu_vector_mul); @@ -335,37 +404,71 @@ REGISTER_VECTOR_MUL_BACKEND("CPU", cpu_vector_mul); /*********************************** DIV ***********************************/ template eIcicleError -cpu_vector_div(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +cpu_vector_div(const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return 
cpu_2vectors_op(VecOperation::VECTOR_DIV, vec_a, vec_b, n, config, output); + return cpu_2vectors_op(VecOperation::VECTOR_DIV, vec_a, vec_b, size, config, output); } REGISTER_VECTOR_DIV_BACKEND("CPU", cpu_vector_div); +/*********************************** CONVERT MONTGOMERY ***********************************/ +template +eIcicleError cpu_convert_montgomery( + const Device& device, const T* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, T* output) +{ + TasksManager> task_manager(get_nof_workers(config)); + const uint64_t total_nof_operations = size*config.batch_size; + for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_1op_task( + is_to_montgomery ? CONVERT_TO_MONTGOMERY : CONVERT_FROM_MONTGOMERY, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), + input + i, output + i); + } + task_manager.wait_done(); + return eIcicleError::SUCCESS; +} + +REGISTER_CONVERT_MONTGOMERY_BACKEND("CPU", cpu_convert_montgomery); + +#ifdef EXT_FIELD +REGISTER_VECTOR_ADD_EXT_FIELD_BACKEND("CPU", cpu_vector_add); +REGISTER_VECTOR_ACCUMULATE_EXT_FIELD_BACKEND("CPU", cpu_vector_accumulate); +REGISTER_VECTOR_SUB_EXT_FIELD_BACKEND("CPU", cpu_vector_sub); +REGISTER_VECTOR_MUL_EXT_FIELD_BACKEND("CPU", cpu_vector_mul); +REGISTER_CONVERT_MONTGOMERY_EXT_FIELD_BACKEND("CPU", cpu_convert_montgomery); +#endif // EXT_FIELD + /*********************************** SUM ***********************************/ template -eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output, uint64_t offset, uint64_t stride) +eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) { TasksManager> task_manager(get_nof_workers(config)); - bool output_initialized = false; - uint64_t vec_a_offset = offset; - assert(stride > 0); - const uint64_t 
slice_length = stride * NOF_OPERATIONS_PER_TASK; + std::vector output_initialized = std::vector(config.batch_size, false); + uint64_t vec_a_offset = 0; + uint64_t idx_in_batch = 0; // run until all vector deployed and all tasks completed while (true) { - VectorOpTask* task_p = vec_a_offset < n ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + VectorOpTask* task_p = vec_a_offset < size ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); if (task_p == nullptr) { return eIcicleError::SUCCESS; } if (task_p->is_completed()) { - *output = output_initialized ? *output + task_p->m_intermidiate_res : task_p->m_intermidiate_res; - output_initialized = true; + output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] ? output[task_p->m_idx_in_batch] + task_p->m_intermidiate_res[task_p->m_idx_in_batch] : task_p->m_intermidiate_res[task_p->m_idx_in_batch]; + output_initialized[task_p->m_idx_in_batch] = true; } - if (vec_a_offset < n) { + if (vec_a_offset < size) { + task_p->m_idx_in_batch = idx_in_batch; task_p->send_intermidiate_res_task( - VecOperation::VECTOR_SUM, std::min( slice_length , n - vec_a_offset), vec_a + vec_a_offset, stride); - vec_a_offset += slice_length; + VecOperation::VECTOR_SUM, + std::min((uint64_t)NOF_OPERATIONS_PER_TASK , size - vec_a_offset), + config.columns_batch? vec_a + idx_in_batch + vec_a_offset*config.batch_size : vec_a + idx_in_batch*size + vec_a_offset, + config.columns_batch? 
config.batch_size : 1); + idx_in_batch++; + if (idx_in_batch == config.batch_size) { + vec_a_offset += NOF_OPERATIONS_PER_TASK; + idx_in_batch = 0; + } } else { task_p->set_idle(); @@ -377,53 +480,49 @@ REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); /*********************************** PRODUCT ***********************************/ template -eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output, uint64_t offset, uint64_t stride) +eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) { TasksManager> task_manager(get_nof_workers(config)); - bool output_initialized = false; - uint64_t vec_a_offset = offset; - assert(stride > 0); - const uint64_t slice_length = stride * NOF_OPERATIONS_PER_TASK; - + std::vector output_initialized = std::vector(config.batch_size, false); + uint64_t vec_a_offset = 0; + uint64_t idx_in_batch = 0; // run until all vector deployed and all tasks completed while (true) { - VectorOpTask* task_p = vec_a_offset < n ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + VectorOpTask* task_p = vec_a_offset < size ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); if (task_p == nullptr) { return eIcicleError::SUCCESS; } if (task_p->is_completed()) { - *output = output_initialized ? *output * task_p->m_intermidiate_res : task_p->m_intermidiate_res; - output_initialized = true; + output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] ? 
output[task_p->m_idx_in_batch] + task_p->m_intermidiate_res[task_p->m_idx_in_batch] : task_p->m_intermidiate_res[task_p->m_idx_in_batch]; + output_initialized[task_p->m_idx_in_batch] = true; } - if (vec_a_offset < n) { + if (vec_a_offset < size) { + task_p->m_idx_in_batch = idx_in_batch; task_p->send_intermidiate_res_task( - VecOperation::VECTOR_PRODUCT, std::min(slice_length, n - vec_a_offset), vec_a + vec_a_offset, stride); - vec_a_offset += slice_length; + VecOperation::VECTOR_PRODUCT, + std::min((uint64_t)NOF_OPERATIONS_PER_TASK , size - vec_a_offset), + config.columns_batch? vec_a + idx_in_batch + vec_a_offset*config.batch_size : vec_a + idx_in_batch*size + vec_a_offset, + config.columns_batch? config.batch_size : 1); + idx_in_batch++; + if (idx_in_batch == config.batch_size) { + vec_a_offset += NOF_OPERATIONS_PER_TASK; + idx_in_batch = 0; + } } else { task_p->set_idle(); } - } + } } REGISTER_VECTOR_PRODUCT_BACKEND("CPU", cpu_vector_product); -/*********************************** MUL BY SCALAR***********************************/ -template -eIcicleError cpu_scalar_mul( - const Device& device, const T* scalar_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) -{ - return cpu_scalar_vector_op(VecOperation::SCALAR_MUL_VEC, scalar_a, vec_b, n, config, output); -} - -REGISTER_SCALAR_MUL_VEC_BACKEND("CPU", cpu_scalar_mul); - /*********************************** Scalar + Vector***********************************/ template eIcicleError cpu_scalar_add( - const Device& device, const T* scalar_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) + const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output) { - return cpu_scalar_vector_op(VecOperation::SCALAR_ADD_VEC, scalar_a, vec_b, n, config, output); + return cpu_scalar_vector_op(VecOperation::SCALAR_ADD_VEC, scalar_a, vec_b, size, use_single_scalar, config, output); } REGISTER_SCALAR_ADD_VEC_BACKEND("CPU", 
cpu_scalar_add); @@ -431,60 +530,161 @@ REGISTER_SCALAR_ADD_VEC_BACKEND("CPU", cpu_scalar_add); /*********************************** Scalar - Vector***********************************/ template eIcicleError cpu_scalar_sub( - const Device& device, const T* scalar_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) + const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output) { - return cpu_scalar_vector_op(VecOperation::SCALAR_SUB_VEC, scalar_a, vec_b, n, config, output); + return cpu_scalar_vector_op(VecOperation::SCALAR_SUB_VEC, scalar_a, vec_b, size, use_single_scalar, config, output); } REGISTER_SCALAR_SUB_VEC_BACKEND("CPU", cpu_scalar_sub); -/*********************************** CONVERT MONTGOMERY ***********************************/ +/*********************************** MUL BY SCALAR***********************************/ template -eIcicleError cpu_convert_montgomery( - const Device& device, const T* input, uint64_t n, bool is_into, const VecOpsConfig& config, T* output) +eIcicleError cpu_scalar_mul( + const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < n; i += NOF_OPERATIONS_PER_TASK) { - VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_1op_task( - is_into ? 
CONVERT_TO_MONTGOMERY : CONVERT_FROM_MONTGOMERY, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - i), - input + i, output + i); + return cpu_scalar_vector_op(VecOperation::SCALAR_MUL_VEC, scalar_a, vec_b, size, use_single_scalar, config, output); +} + +REGISTER_SCALAR_MUL_VEC_BACKEND("CPU", cpu_scalar_mul); + +/*********************************** TRANSPOSE ***********************************/ +// template todo shanie - remove +// eIcicleError cpu_matrix_transpose_basic( +// const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) +// { +// ICICLE_ASSERT(mat_in && mat_out && nof_rows != 0 && nof_cols != 0) << "Invalid argument"; + +// // Perform the matrix transpose +// for (uint32_t i = 0; i < nof_rows; ++i) { +// for (uint32_t j = 0; j < nof_cols; ++j) { +// mat_out[j * nof_rows + i] = mat_in[i * nof_cols + j]; +// } +// } + +// return eIcicleError::SUCCESS; +// } + +template +eIcicleError cpu_matrix_transpose_batch( + const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) +{ + const T* cur_mat_in = mat_in; + T* cur_mat_out = mat_out; + uint32_t stride = config.columns_batch? config.batch_size : 1; + const uint64_t total_elements = static_cast(nof_rows) * nof_cols; + for (uint32_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + // Perform the matrix transpose + for (uint32_t i = 0; i < nof_rows; ++i) { + for (uint32_t j = 0; j < nof_cols; ++j) { + cur_mat_out[stride*(j * nof_rows + i)] = cur_mat_in[stride*(i * nof_cols + j)]; + } + } + cur_mat_in += (config.columns_batch ? 1 : total_elements); + cur_mat_out += (config.columns_batch ? 
1 : total_elements); } - task_manager.wait_done(); + return eIcicleError::SUCCESS; } -REGISTER_CONVERT_MONTGOMERY_BACKEND("CPU", cpu_convert_montgomery); +uint32_t gcd(uint32_t a, uint32_t b) { + while (b != 0) { + uint32_t temp = b; + b = a % b; + a = temp; + } + return a; +} -#ifdef EXT_FIELD -REGISTER_VECTOR_ADD_EXT_FIELD_BACKEND("CPU", cpu_vector_add); -REGISTER_VECTOR_ACCUMULATE_EXT_FIELD_BACKEND("CPU", cpu_vector_accumulate); -REGISTER_VECTOR_SUB_EXT_FIELD_BACKEND("CPU", cpu_vector_sub); -REGISTER_VECTOR_MUL_EXT_FIELD_BACKEND("CPU", cpu_vector_mul); -REGISTER_CONVERT_MONTGOMERY_EXT_FIELD_BACKEND("CPU", cpu_convert_montgomery); -#endif // EXT_FIELD +// template //TODO shanie - remove +// void replace_elements(uint32_t start_idx, uint32_t log_nof_rows, uint32_t log_nof_cols, const T* mat_in, T* mat_out) { +// uint64_t idx = start_idx; + +// while (true) { +// uint64_t new_idx = mersenne_mod(idx << log_nof_rows, log_nof_rows+log_nof_cols); // new_idx = (idx< +void gen_necklace(uint32_t t, uint32_t p, uint32_t k, uint32_t length, std::vector& necklace, std::vector& task_indices) { + if (t > length) { + if (length % p == 0 && !std::all_of(necklace.begin() + 1, necklace.begin() + length + 1,[first_element = necklace[1]](uint32_t x) { return x == first_element; })) { + uint32_t start_idx = 0; + uint64_t multiplier = 1; + for (int i = length; i >= 1; --i) { // Compute start_idx as the decimal representation of the necklace + start_idx += necklace[i] * multiplier; + multiplier *= k; + } + // for (int i = 1; i <= length; ++i) { // Compute start_idx as the decimal representation of the necklace //TODO SHANIE - remove + // start_idx = start_idx + necklace[i] * std::pow(k, length - i); + // } + task_indices.push_back(start_idx); + } + return; + } + + necklace[t] = necklace[t - p]; + gen_necklace(t + 1, p, k, length, necklace, task_indices); + + for (int i = necklace[t - p] + 1; i < k; ++i) { + necklace[t] = i; + gen_necklace(t + 1, t, k, length, necklace, task_indices); + 
} +} -/*********************************** TRANSPOSE ***********************************/ template -eIcicleError cpu_matrix_transpose( +eIcicleError cpu_matrix_transpose_parallel( const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) { - // Check for invalid arguments - if (!mat_in || !mat_out || nof_rows == 0 || nof_cols == 0) { return eIcicleError::INVALID_ARGUMENT; } + ICICLE_ASSERT(mat_in && mat_out && nof_rows != 0 && nof_cols != 0) << "Invalid argument"; - // Perform the matrix transpose - for (uint32_t i = 0; i < nof_rows; ++i) { - for (uint32_t j = 0; j < nof_cols; ++j) { - mat_out[j * nof_rows + i] = mat_in[i * nof_cols + j]; - } + // check if the number of rows and columns are powers of 2, if not use the basic transpose + if ((nof_rows & (nof_rows - 1)) != 0 || (nof_cols & (nof_cols - 1)) != 0) { + cpu_matrix_transpose_batch(device, mat_in, nof_rows, nof_cols, config, mat_out); + return eIcicleError::SUCCESS; } + uint32_t log_nof_rows = static_cast(std::floor(std::log2(nof_rows))); + uint32_t log_nof_cols = static_cast(std::floor(std::log2(nof_cols))); + uint32_t gcd_value = gcd(log_nof_rows, log_nof_cols); + uint32_t k = 1 << gcd_value; // Base of necklaces + uint32_t length = (log_nof_cols + log_nof_rows) / gcd_value; // length of necklaces. 
Since all are powers of 2, equivalent to (log_nof_cols + log_nof_rows) / gcd_value; + const uint64_t max_nof_operations = NOF_OPERATIONS_PER_TASK / length; + const uint64_t total_elements = static_cast(nof_rows) * nof_cols; + + std::vector necklace(length + 1, 0); + std::vector start_indices_in_mat; // Collect start indices + gen_necklace(1, 1, k, length, necklace, start_indices_in_mat); + + TasksManager> task_manager(get_nof_workers(config)); + for (uint64_t i = 0; i < start_indices_in_mat.size(); i += max_nof_operations) { + uint64_t nof_operations = std::min((uint64_t)max_nof_operations, start_indices_in_mat.size() - i); + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_replace_elements_task( + REPLACE_ELEMENTS, + config.columns_batch? mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements, + nof_operations, + start_indices_in_mat, + i, + log_nof_rows, + log_nof_cols, + config.columns_batch? config.batch_size : 1, + config.columns_batch?
mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements); + } + } + task_manager.wait_done(); return eIcicleError::SUCCESS; } -REGISTER_MATRIX_TRANSPOSE_BACKEND("CPU", cpu_matrix_transpose); +REGISTER_MATRIX_TRANSPOSE_BACKEND("CPU", cpu_matrix_transpose_parallel); #ifdef EXT_FIELD -REGISTER_MATRIX_TRANSPOSE_EXT_FIELD_BACKEND("CPU", cpu_matrix_transpose); +REGISTER_MATRIX_TRANSPOSE_EXT_FIELD_BACKEND("CPU", cpu_matrix_transpose_parallel); #endif // EXT_FIELD /*********************************** BIT REVERSE ***********************************/ @@ -492,21 +692,26 @@ template eIcicleError cpu_bit_reverse(const Device& device, const T* vec_in, uint64_t size, const VecOpsConfig& config, T* vec_out) { - // Check for invalid arguments - if (!vec_in || !vec_out || size == 0) { return eIcicleError::INVALID_ARGUMENT; } + ICICLE_ASSERT(vec_in && vec_out && size != 0) << "Invalid argument"; - // Calculate log2(size) - int logn = static_cast(std::floor(std::log2(size))); - if ((1ULL << logn) != size) { - return eIcicleError::INVALID_ARGUMENT; // Ensure size is a power of 2 - } + uint32_t logn = static_cast(std::floor(std::log2(size))); + ICICLE_ASSERT((1ULL << logn) == size) << "Invalid argument - size is not a power of 2"; // Perform the bit reverse TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < size; i += NOF_OPERATIONS_PER_TASK) { - VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_bitrev_task( - BIT_REVERSE, logn, i, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - i), vec_in, vec_out); + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + for (uint64_t i = 0; i < size; i += NOF_OPERATIONS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + + task_p->send_bitrev_task( + BIT_REVERSE, + logn, + i, + std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - i), + config.columns_batch? 
vec_in + idx_in_batch : vec_in + idx_in_batch*size, + config.columns_batch? config.batch_size : 1, + config.columns_batch? vec_out + idx_in_batch: vec_out + idx_in_batch*size); + } } task_manager.wait_done(); return eIcicleError::SUCCESS; @@ -525,20 +730,27 @@ eIcicleError cpu_slice( const T* vec_in, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig& config, T* vec_out) { - if (vec_in == nullptr || vec_out == nullptr) { - ICICLE_LOG_ERROR << "Error: Invalid argument - input or output vector is null"; - return eIcicleError::INVALID_ARGUMENT; - } + + ICICLE_ASSERT(vec_in != nullptr && vec_out != nullptr) << "Error: Invalid argument - input or output vector is null"; + ICICLE_ASSERT(offset + (size_out-1) * stride < size_in) << "Error: Invalid argument - slice out of bound"; TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < size; i += NOF_OPERATIONS_PER_TASK) { - VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_slice_task( - SLICE, stride, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - i), vec_in + offset + i * stride, vec_out + i); + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + for (uint64_t i = 0; i < size_out; i += NOF_OPERATIONS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_slice_task( + SLICE, + config.columns_batch? stride*config.batch_size : stride, + config.columns_batch? config.batch_size : 1, + std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size_out - i), + config.columns_batch? vec_in + idx_in_batch + (offset + i * stride)*config.batch_size : vec_in + idx_in_batch*size_in + offset + i * stride, + config.columns_batch? 
vec_out + idx_in_batch + i*config.batch_size : vec_out + idx_in_batch*size_out + i); + } } task_manager.wait_done(); return eIcicleError::SUCCESS; @@ -549,6 +761,29 @@ REGISTER_SLICE_BACKEND("CPU", cpu_slice); REGISTER_SLICE_EXT_FIELD_BACKEND("CPU", cpu_slice); #endif // EXT_FIELD +/*********************************** Highest non-zero idx ***********************************/ +template +eIcicleError cpu_highest_non_zero_idx( + const Device& device, const T* input, uint64_t size, const VecOpsConfig& config, int64_t* out_idx /*OUT*/) +{ + ICICLE_ASSERT(input && out_idx && size !=0) << "Error: Invalid argument"; + uint64_t stride = config.columns_batch? config.batch_size : 1; + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) { + out_idx[idx_in_batch] = -1; // zero vector is considered '-1' since 0 would be zero in vec[0] + const T* curr_input = config.columns_batch? input + idx_in_batch : input + idx_in_batch * size; // Pointer to the current vector + for (int64_t i = size - 1; i >= 0; --i) { + if (curr_input[i * stride] != T::zero()) { + out_idx[idx_in_batch] = i; + break; + } + } + } + return eIcicleError::SUCCESS; +} + +REGISTER_HIGHEST_NON_ZERO_IDX_BACKEND("CPU", cpu_highest_non_zero_idx); + + /*********************************** Polynomial evaluation ***********************************/ template @@ -561,12 +796,18 @@ eIcicleError cpu_poly_eval( const VecOpsConfig& config, T* evals /*OUT*/) { + ICICLE_ASSERT(coeffs && domain && evals && coeffs_size != 0 && domain_size != 0) << "Error: Invalid argument"; // using Horner's method // example: ax^2+bx+c is computed as (1) r=a, (2) r=r*x+b, (3) r=r*x+c - for (uint64_t eval_idx = 0; eval_idx < domain_size; ++eval_idx) { - evals[eval_idx] = coeffs[coeffs_size - 1]; - for (int64_t coeff_idx = coeffs_size - 2; coeff_idx >= 0; --coeff_idx) { - evals[eval_idx] = evals[eval_idx] * domain[eval_idx] + coeffs[coeff_idx]; + uint64_t stride = config.columns_batch ? 
config.batch_size : 1; + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) { + const T* curr_coeffs = config.columns_batch? coeffs + idx_in_batch : coeffs + idx_in_batch * coeffs_size; + T* curr_evals = config.columns_batch? evals + idx_in_batch : evals + idx_in_batch * domain_size; + for (uint64_t eval_idx = 0; eval_idx < domain_size; ++eval_idx) { + curr_evals[eval_idx * stride] = curr_coeffs[(coeffs_size - 1) * stride]; + for (int64_t coeff_idx = coeffs_size - 2; coeff_idx >= 0; --coeff_idx) { + curr_evals[eval_idx * stride] = curr_evals[eval_idx * stride] * domain[eval_idx] + curr_coeffs[coeff_idx * stride]; + } } } return eIcicleError::SUCCESS; @@ -574,23 +815,6 @@ eIcicleError cpu_poly_eval( REGISTER_POLYNOMIAL_EVAL("CPU", cpu_poly_eval); -/*********************************** Highest non-zero idx ***********************************/ -template -eIcicleError cpu_highest_non_zero_idx( - const Device& device, const T* input, uint64_t size, const VecOpsConfig& config, int64_t* out_idx /*OUT*/) -{ - *out_idx = -1; // zero vector is considered '-1' since 0 would be zero in vec[0] - for (int64_t i = size - 1; i >= 0; --i) { - if (input[i] != T::zero()) { - *out_idx = i; - break; - } - } - return eIcicleError::SUCCESS; -} - -REGISTER_HIGHEST_NON_ZERO_IDX_BACKEND("CPU", cpu_highest_non_zero_idx); - /*============================== polynomial division ==============================*/ template void school_book_division_step_cpu(T* r, T* q, const T* b, int deg_r, int deg_b, const T& lc_b_inv) @@ -627,21 +851,24 @@ eIcicleError cpu_poly_divide( ICICLE_ASSERT(q_size >= (numerator_deg - denumerator_deg + 1)) << "polynomial division expects q(x) size to be at least deg(numerator)-deg(denumerator)+1"; - ICICLE_CHECK(icicle_copy_async(r_out, numerator, r_size * sizeof(T), config.stream)); - - // invert largest coeff of b - const T& lc_b_inv = T::inverse(denumerator[denumerator_deg]); - - int64_t deg_r = numerator_deg; - while (deg_r >= 
denumerator_deg) { - // each iteration is removing the largest monomial in r until deg(r)= denumerator_deg) { + // each iteration is removing the largest monomial in r until deg(r); + scalar_t* output)>; using scalarVectorOpImpl = std::function; - using scalarVectorOpImplInplaceA = std::function; + + using vectorVectorOpImpl = std::function; + + using vectorVectorOpImplInplaceA = std::function; void register_vector_sum(const std::string& deviceType, scalarVectorReduceOpImpl impl); @@ -51,7 +59,7 @@ namespace icicle { - void register_vector_add(const std::string& deviceType, scalarVectorOpImpl impl); + void register_vector_add(const std::string& deviceType, vectorVectorOpImpl impl); #define REGISTER_VECTOR_ADD_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ @@ -61,7 +69,7 @@ namespace icicle { }(); \ } - void register_vector_accumulate(const std::string& deviceType, scalarVectorOpImplInplaceA impl); + void register_vector_accumulate(const std::string& deviceType, vectorVectorOpImplInplaceA impl); #define REGISTER_VECTOR_ACCUMULATE_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ @@ -71,7 +79,7 @@ namespace icicle { }(); \ } - void register_vector_sub(const std::string& deviceType, scalarVectorOpImpl impl); + void register_vector_sub(const std::string& deviceType, vectorVectorOpImpl impl); #define REGISTER_VECTOR_SUB_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ static bool UNIQUE(_reg_vec_sub) = []() -> bool { \ @@ -80,7 +88,7 @@ namespace icicle { }(); \ } - void register_vector_mul(const std::string& deviceType, scalarVectorOpImpl impl); + void register_vector_mul(const std::string& deviceType, vectorVectorOpImpl impl); #define REGISTER_VECTOR_MUL_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ @@ -90,7 +98,7 @@ namespace icicle { }(); \ } - void register_vector_div(const std::string& deviceType, scalarVectorOpImpl impl); + void register_vector_div(const std::string& deviceType, vectorVectorOpImpl impl); #define REGISTER_VECTOR_DIV_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ @@ 
-134,7 +142,7 @@ namespace icicle { const Device& device, const scalar_t* input, uint64_t size, - bool is_into, + bool is_to_montgomery, const VecOpsConfig& config, scalar_t* output)>; @@ -184,7 +192,8 @@ namespace icicle { const scalar_t* input, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig& config, scalar_t* output)>; @@ -266,12 +275,12 @@ namespace icicle { const Device& device, const extension_t* vec_a, const extension_t* vec_b, - uint64_t n, + uint64_t size, const VecOpsConfig& config, extension_t* output)>; using extFieldVectorOpImplInplaceA = std::function; + const Device& device, extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config)>; void register_extension_vector_add(const std::string& deviceType, extFieldVectorOpImpl impl); @@ -316,7 +325,7 @@ namespace icicle { const Device& device, const extension_t* input, uint64_t size, - bool is_into, + bool is_to_montgomery, const VecOpsConfig& config, extension_t* output)>; diff --git a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h index f0643f978..7c0cca845 100644 --- a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h +++ b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h @@ -65,7 +65,7 @@ namespace icicle { config.is_async = true; config.stream = m_stream; - ICICLE_CHECK(icicle::slice(in_coeffs, offset, stride, out_size, config, out_coeffs)); + ICICLE_CHECK(icicle::slice(in_coeffs, offset, stride, size, out_size, config, out_coeffs)); } void add_sub(PolyContext& res, PolyContext a, PolyContext b, bool add1_sub0) @@ -126,7 +126,7 @@ namespace icicle { C zero = C::zero(); config.is_a_on_device = false; ICICLE_CHECK( - scalar_sub_vec(&zero, b_mem_p + min_op_size, b->get_nof_elements() - min_op_size, config, res_mem_p)); + scalar_sub_vec(&zero, b_mem_p + min_op_size, 
b->get_nof_elements() - min_op_size, true, config, res_mem_p)); } } @@ -173,7 +173,7 @@ namespace icicle { config.is_result_on_device = true; config.is_async = true; config.stream = m_stream; - icicle::scalar_mul_vec(&scalar, p_elements_p, N, config, out_evals_p); + icicle::scalar_mul_vec(&scalar, p_elements_p, N, true, config, out_evals_p); } void multiply_with_padding(PolyContext c, PolyContext a, PolyContext b) @@ -409,7 +409,7 @@ namespace icicle { config.is_async = true; config.stream = m_stream; icicle::scalar_mul_vec( - &v_coset_eval, numerator_evals_reversed_p + N /*second half is the reversed coset*/, N, config, + &v_coset_eval, numerator_evals_reversed_p + N /*second half is the reversed coset*/, N, true, config, out_evals_reversed_p); // INTT back from reversed evals on coset to coeffs @@ -450,7 +450,7 @@ namespace icicle { config.is_result_on_device = true; config.is_async = true; config.stream = m_stream; - icicle::scalar_mul_vec(&v_coset_eval, out_evals_reversed_p, N, config, out_evals_reversed_p); + icicle::scalar_mul_vec(&v_coset_eval, out_evals_reversed_p, N, true, config, out_evals_reversed_p); // (3) INTT back from coset to coeffs ntt_config.are_inputs_on_device = true; @@ -547,7 +547,7 @@ namespace icicle { config.is_async = true; config.stream = m_stream; ICICLE_CHECK( - icicle::slice(get_context_storage_immutable(p), 0 /*offset*/, stride, domain_size, config, d_evals)); + icicle::slice(get_context_storage_immutable(p), 0 /*offset*/, stride, poly_size, domain_size, config, d_evals)); } else { ICICLE_CHECK(icicle_memset(d_evals, 0, domain_size * sizeof(I))); auto ntt_config = default_ntt_config(); diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index e0cf6f7af..42dfca8bd 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -17,16 +17,23 @@ namespace icicle { * @note APIs with a single input, ignore input b. 
*/ struct VecOpsConfig { - icicleStreamHandle stream; /**< Stream for asynchronous execution. */ - bool is_a_on_device; /**< True if `a` is on the device, false if it is not. Default value: false. */ - bool is_b_on_device; /**< True if `b` is on the device, false if it is not. Default value: false. OPTIONAL. */ - bool is_result_on_device; /**< If true, the output is preserved on the device, otherwise on the host. Default value: - false. */ - bool is_async; /**< Whether to run the vector operations asynchronously. - If set to `true`, the function will be non-blocking and synchronization - must be explicitly managed using `cudaStreamSynchronize` or `cudaDeviceSynchronize`. - If set to `false`, the function will block the current CPU thread. */ - ConfigExtension* ext = nullptr; /**< Backend-specific extension. */ + icicleStreamHandle stream; /** Stream for asynchronous execution. */ + bool is_a_on_device; /** True if `a` is on the device, false if it is not. Default value: false. */ + bool is_b_on_device; /** True if `b` is on the device, false if it is not. Default value: false. OPTIONAL. */ + bool is_result_on_device; /** If true, the output is preserved on the device, otherwise on the host. Default value: + false. */ + bool is_async; /** Whether to run the vector operations asynchronously. + If set to `true`, the function will be non-blocking and synchronization + must be explicitly managed using `cudaStreamSynchronize` or `cudaDeviceSynchronize`. + If set to `false`, the function will block the current CPU thread. */ + int batch_size; /** Number of vectors (or operations) to process in a batch. + Each vector operation will be performed independently on each batch element. + Default value: 1. */ + bool + columns_batch; /** True if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). + If false, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array). 
+ Default value: false. */ + ConfigExtension* ext = nullptr; /** Backend-specific extension. */ }; /** @@ -42,52 +49,29 @@ namespace icicle { false, // is_b_on_device false, // is_result_on_device false, // is_async + 1, // batch_size + false, // columns_batch }; return config; } - // Reduction operations - - /** - * @brief Computes the product of all elements in a vector. - * - * @tparam T Type of the elements in the vector. - * @param vec_a Input vector. - * @param n Number of elements in the vector. - * @param config Configuration for the operation. - * @param output Output scalar to store the result. - * @return eIcicleError Error code indicating success or failure. - */ - - template - eIcicleError vector_product(const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output, uint64_t offset, uint64_t stride); - - /** - * @brief Computes the sum of all elements in a vector. - * - * @tparam T Type of the elements in the vector. - * @param vec_a Input vector. - * @param n Number of elements in the vector. - * @param config Configuration for the operation. - * @param output Output scalar to store the result. - * @return eIcicleError Error code indicating success or failure. - */ - - template - eIcicleError vector_sum(const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output, uint64_t offset, uint64_t stride); - - // Element-wise vector operations /** * @brief Adds two vectors element-wise. * * @tparam T Type of the elements in the vectors. - * @param vec_a Input vector `a`. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). 
+ * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. */ template @@ -97,24 +81,35 @@ namespace icicle { * @brief Accumulates the elements of two vectors element-wise and stores the result in the first vector. * * @tparam T Type of the elements in the vectors. - * @param vec_a Input/output vector `a`. The result will be written back to this vector. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first Input/output vector(s). The result will be written back to this vector. + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). + * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. * @return eIcicleError Error code indicating success or failure. */ template - eIcicleError vector_accumulate(T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config); + eIcicleError vector_accumulate(T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config); // use vector_add (inplace) /** * @brief Subtracts vector `b` from vector `a` element-wise. * * @tparam T Type of the elements in the vectors. - * @param vec_a Input vector `a`. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first input vector(s). 
+ * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). + * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. */ template @@ -124,11 +119,17 @@ namespace icicle { * @brief Multiplies two vectors element-wise. * * @tparam T Type of the elements in the vectors. - * @param vec_a Input vector `a`. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). + * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. */ template @@ -138,11 +139,17 @@ namespace icicle { * @brief Divides vector `a` by vector `b` element-wise. * * @tparam T Type of the elements in the vectors. 
- * @param vec_a Input vector `a`. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). + * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. */ template @@ -152,15 +159,60 @@ namespace icicle { * @brief Converts elements to and from Montgomery form. * * @tparam T Type of the elements. - * @param input Input vector. - * @param size Number of elements in the input vector. - * @param is_into True to convert into Montgomery form, false to convert out of Montgomery form. + * @param input Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in each vector. + * @param is_to_montgomery True to convert into Montgomery form, false to convert out of Montgomery form. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. 
* @return eIcicleError Error code indicating success or failure. */ template - eIcicleError convert_montgomery(const T* input, uint64_t size, bool is_into, const VecOpsConfig& config, T* output); + eIcicleError convert_montgomery(const T* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, T* output); + + // Reduction operations + + /** + * @brief Computes the sum of all elements in each vector in a batch. + * + * @tparam T Type of the elements in the vector. + * @param vec_a Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in each vector. + * @param config Configuration for the operation. + * @param output Pointer to the output array where the results will be stored. + * @return eIcicleError Error code indicating success or failure. + */ + + template + eIcicleError vector_sum(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); + + /** + * @brief Computes the product of all elements in each vector in the batch. + * + * @tparam T Type of the elements in the vectors. + * @param vec_a Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in each vector. + * @param config Configuration for the operation. + * @param output Pointer to the output array where the results will be stored. + * @return eIcicleError Error code indicating success or failure. 
+ */ + + template + eIcicleError vector_product(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); + + // Scalar-Vector operations @@ -168,43 +220,66 @@ namespace icicle { * @brief Adds a scalar to each element of a vector. * * @tparam T Type of the elements in the vector and the scalar. - * @param scalar_a Input scalar. - * @param vec_b Input vector. - * @param size Number of elements in the vector. + * @param scalar_a Pointer to the input scalar(s). + * - If `use_single_scalar` is `true`, this should point to a single scalar value. + * - If `use_single_scalar` is `false`, this should point to an array of scalars with length `config.batch_size`. + * @param vec_b Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in a vector. + * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of scalars (`false`). * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. + * @note To subtract a scalar from each element of a vector - use scalar_add_vec with negative scalar. */ template - eIcicleError scalar_add_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); + eIcicleError scalar_add_vec(const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); /** * @brief Subtracts each element of a vector from a scalar, elementwise (res[i]=scalar-vec[i]). * * @tparam T Type of the elements in the vector and the scalar. - * @param scalar_a Input scalar. 
- * @param vec_b Input vector. - * @param size Number of elements in the vector. + * @param scalar_a Pointer to Input scalar(s). + * - If `use_single_scalar` is `true`, this should point to a single scalar value. + * - If `use_single_scalar` is `false`, this should point to an array of scalars with length `config.batch_size`. + * @param vec_b Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in a vector. + * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of scalars (`false`). * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. + * @note To subtract a scalar from each element of a vector - use scalar_add_vec with negative scalar. */ template - eIcicleError scalar_sub_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); + eIcicleError scalar_sub_vec(const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); /** * @brief Multiplies each element of a vector by a scalar. * * @tparam T Type of the elements in the vector and the scalar. - * @param scalar_a Input scalar. - * @param vec_b Input vector. - * @param size Number of elements in the vector. + * @param scalar_a Pointer to Input scalar(s). + * - If `use_single_scalar` is `true`, this should point to a single scalar value. + * - If `use_single_scalar` is `false`, this should point to an array of scalars with length `config.batch_size`. + * @param vec_b Pointer to the input vector(s). 
+ * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in a vector. + * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of scalars (`false`). * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. */ template - eIcicleError scalar_mul_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); + eIcicleError scalar_mul_vec(const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); // Matrix operations @@ -212,56 +287,70 @@ namespace icicle { * @brief Transposes a matrix. * * @tparam T Type of the elements in the matrix. - * @param mat_in Input matrix. - * @param nof_rows Number of rows in the input matrix. - * @param nof_cols Number of columns in the input matrix. + * @param mat_in Pointer to the input matrix or matrices. + * @param nof_rows Number of rows in each input matrix. + * @param nof_cols Number of columns in each input matrix. * @param config Configuration for the operation. - * @param mat_out Output matrix to store the result. + * @param mat_out Pointer to the output matrix or matrices where the transposed matrices will be stored. * @return eIcicleError Error code indicating success or failure. + * @note The input matrices are assumed to be stored in row-major order. + * This function transposes an input matrix or a batch of matrices. 
*/ template eIcicleError matrix_transpose(const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out); - + + // Miscellaneous operations /** - * @brief Reorders the vector elements based on bit-reverse. That is out[i]=in[bitrev[i]]. + * @brief Reorders the vector (or batch of vectors) elements based on bit-reverse. That is out[i]=in[bitrev[i]]. * * @tparam T Type of the elements in the vector. - * @param vec_in Input vector. - * @param size Number of elements in the input vector. + * @param vec_in Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param vec_out Output vector to store the result. + * @param vec_out Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. + * @note If `vec_in` and `vec_out` point to the same memory location, the operation is performed in-place. */ template eIcicleError bit_reverse(const T* vec_in, uint64_t size, const VecOpsConfig& config, T* vec_out); /** - * @brief Extracts a slice from a vector. + * @brief Extracts a slice from a vector or batch of vectors. * * @tparam T Type of the elements in the vector. - * @param vec_in Input vector. - * @param offset Offset from which to start the slice. + * @param vec_in Pointer to the input vector(s). + * @param offset Offset from which to start the slice in each vector. * @param stride Stride between elements in the slice. - * @param size Number of elements in the slice. - * @param config Configuration for the operation. - * @param vec_out Output vector to store the result. 
+   * @param size_in Number of elements in one input vector.
+   * @param size_out Number of elements in one output vector.
+   * @param config Configuration for the operation.
+   * @param vec_out Pointer to the output vector(s) where the results will be stored.
+   *                The output array should have the same storage layout as the input vectors.
    * @return eIcicleError Error code indicating success or failure.
+   * @note The total input size is `size_in * config.batch_size`.
+   *       The total output size is `size_out * config.batch_size`.
    */
   template <typename T>
   eIcicleError
-  slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size, const VecOpsConfig& config, T* vec_out);
+  slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size_in, uint64_t size_out, const VecOpsConfig& config, T* vec_out);

   /**
-   * @brief Finds the highest non-zero index in a vector.
+   * @brief Finds the highest non-zero index in a vector or batch of vectors.
    *
    * @tparam T Type of the elements in the vector.
-   * @param vec_in Input vector.
-   * @param size Number of elements in the input vector.
+   * @param vec_in Pointer to the input vector(s).
+   * @param size Number of elements in each input vector.
    * @param config Configuration for the operation.
-   * @param out_idx Output index of the highest non-zero element.
+   * @param out_idx Pointer to an array where the output indices of the highest non-zero element in each input vector will be stored.
+   *                The array should have a length of at least `config.batch_size`.
    * @return eIcicleError Error code indicating success or failure.
    */
   template <typename T>
@@ -271,12 +360,20 @@ namespace icicle {
    * @brief Evaluates a polynomial at given domain points.
    *
    * @tparam T Type of the elements in the polynomial and domain.
-   * @param coeffs Pointer to the array of coefficients of the polynomial.
-   * @param coeffs_size Number of coefficients in the polynomial.
-   * @param domain Pointer to the array of points at which to evaluate the polynomial.
+ * @param coeffs Pointer to the array of coefficients of the polynomial(s). + * - The size of `coeffs` should be `coeffs_size * batch_size`. + * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored contiguously. + * - If `config.columns_batch` is `true`, coefficients are interleaved. + * @param coeffs_size Number of coefficients in each polynomial. + * @param domain Pointer to the array of points at which to evaluate the polynomial(s). + * - The same domain is used for all polynomials. + * - The size of `domain` should be `domain_size`. * @param domain_size Number of domain points. * @param config Configuration for the operation. * @param evals Pointer to the array where the evaluated results will be stored. This is an output parameter. + * - The size of `evals` should be `domain_size * batch_size`. + * - If `config.columns_batch` is `false`, results for each polynomial are stored contiguously. + * - If `config.columns_batch` is `true`, results are interleaved. * @return eIcicleError Error code indicating success or failure. */ template @@ -289,19 +386,30 @@ namespace icicle { T* evals /*OUT*/); /** - * @brief Divides two polynomials. + * @brief Divides two polynomials or batch of couples of polynomials. * * @tparam T Type of the elements in the polynomials. - * @param numerator Pointer to the array of coefficients of the numerator polynomial. + * @param numerator Pointer to the array of coefficients of the numerator polynomial(s). + * - The size of `numerator` should be `(numerator_deg + 1) * batch_size`. + * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored contiguously. + * - If `config.columns_batch` is `true`, coefficients are interleaved. * @param numerator_deg Degree of the numerator polynomial. - * @param denominator Pointer to the array of coefficients of the denominator polynomial. 
+ * @param denominator Pointer to the array of coefficients of the denominator polynomial(s). + * - Storage layout is similar to `numerator`. * @param denominator_deg Degree of the denominator polynomial. * @param config Configuration for the operation. - * @param q_out Pointer to the array where the quotient will be stored. This is an output parameter. - * @param q_size Size of the quotient array. - * @param r_out Pointer to the array where the remainder will be stored. This is an output parameter. + * @param q_out Pointer to the array where the quotient polynomial(s) will be stored. This is an output parameter. + * - The storage layout should match that of `numerator`. + * @param q_size Size of the quotient array for one polynomial. + * @param r_out Pointer to the array where the remainder polynomial(s) will be stored. This is an output parameter. + * - The storage layout should match that of `numerator`. + * - The size of `r_out` should be sufficient to hold the remainder coefficients for each polynomial. * @param r_size Size of the remainder array. * @return eIcicleError Error code indicating success or failure. + * + * @note The degrees should satisfy `numerator_deg >= denominator_deg`. + * The sizes `q_size` and `r_size` must be at least `numerator_deg - denominator_deg + 1` and `denominator_deg`, respectively. + * The function assumes that the input and output arrays are properly allocated. 
*/ template eIcicleError polynomial_division( diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index 29ab25ba0..db86e6e73 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -8,225 +8,225 @@ namespace icicle { ICICLE_DISPATCHER_INST(VectorProductDispatcher, vector_product, scalarVectorReduceOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_product)( - const scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, scalar_t* output, uint64_t offset, uint64_t stride) + const scalar_t* vec_a, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorProductDispatcher::execute(vec_a, n, *config, output, offset, stride); + return VectorProductDispatcher::execute(vec_a, size, *config, output); } template <> eIcicleError - vector_product(const scalar_t* vec_a, uint64_t n, const VecOpsConfig& config, scalar_t* output, uint64_t offset, uint64_t stride) + vector_product(const scalar_t* vec_a, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_product)(vec_a, n, &config, output, offset, stride); + return CONCAT_EXPAND(FIELD, vector_product)(vec_a, size, &config, output); } /*********************************** REDUCE SUM ****************************/ ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, scalarVectorReduceOpImpl ); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_sum)( - const scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, scalar_t* output, uint64_t offset=0, uint64_t stride=1) + const scalar_t* vec_a, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorSumDispatcher::execute(vec_a, n, *config, output, offset, stride); + return VectorSumDispatcher::execute(vec_a, size, *config, output); } template <> eIcicleError - vector_sum(const scalar_t* vec_a, uint64_t n, const VecOpsConfig& config, scalar_t* output, uint64_t offset, uint64_t stride) + vector_sum(const scalar_t* vec_a, uint64_t size, const VecOpsConfig& config, 
scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_sum)(vec_a, n, &config, output, offset, stride); + return CONCAT_EXPAND(FIELD, vector_sum)(vec_a, size, &config, output); } /*********************************** ADD ***********************************/ - ICICLE_DISPATCHER_INST(VectorAddDispatcher, vector_add, scalarVectorOpImpl); + ICICLE_DISPATCHER_INST(VectorAddDispatcher, vector_add, vectorVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_add)( - const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorAddDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorAddDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError - vector_add(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_add(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_add)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_add)(vec_a, vec_b, size, &config, output); } #ifdef EXT_FIELD ICICLE_DISPATCHER_INST(VectorAddExtFieldDispatcher, extension_vector_add, extFieldVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_add)( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig* config, extension_t* output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig* config, extension_t* output) { - return VectorAddExtFieldDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorAddExtFieldDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError vector_add( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig& config, extension_t* 
output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_vector_add)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, extension_vector_add)(vec_a, vec_b, size, &config, output); } #endif // EXT_FIELD /*********************************** ACCUMULATE ***********************************/ - ICICLE_DISPATCHER_INST(VectorAccumulateDispatcher, vector_accumulate, scalarVectorOpImplInplaceA); + ICICLE_DISPATCHER_INST(VectorAccumulateDispatcher, vector_accumulate, vectorVectorOpImplInplaceA); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_accumulate)( - scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config) + scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config) { - return VectorAccumulateDispatcher::execute(vec_a, vec_b, n, *config); + return VectorAccumulateDispatcher::execute(vec_a, vec_b, size, *config); } template <> - eIcicleError vector_accumulate(scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config) + eIcicleError vector_accumulate(scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config) { - return CONCAT_EXPAND(FIELD, vector_accumulate)(vec_a, vec_b, n, &config); + return CONCAT_EXPAND(FIELD, vector_accumulate)(vec_a, vec_b, size, &config); } #ifdef EXT_FIELD ICICLE_DISPATCHER_INST(VectorAccumulateExtFieldDispatcher, extension_vector_accumulate, extFieldVectorOpImplInplaceA); extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_accumulate)( - extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig* config) + extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig* config) { - return VectorAccumulateExtFieldDispatcher::execute(vec_a, vec_b, n, *config); + return VectorAccumulateExtFieldDispatcher::execute(vec_a, vec_b, size, *config); } template <> - eIcicleError 
vector_accumulate(extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig& config) + eIcicleError vector_accumulate(extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config) { - return CONCAT_EXPAND(FIELD, extension_vector_accumulate)(vec_a, vec_b, n, &config); + return CONCAT_EXPAND(FIELD, extension_vector_accumulate)(vec_a, vec_b, size, &config); } #endif // EXT_FIELD /*********************************** SUB ***********************************/ - ICICLE_DISPATCHER_INST(VectorSubDispatcher, vector_sub, scalarVectorOpImpl); + ICICLE_DISPATCHER_INST(VectorSubDispatcher, vector_sub, vectorVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_sub)( - const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorSubDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorSubDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError - vector_sub(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_sub(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_sub)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_sub)(vec_a, vec_b, size, &config, output); } #ifdef EXT_FIELD ICICLE_DISPATCHER_INST(VectorSubExtFieldDispatcher, extension_vector_sub, extFieldVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_sub)( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig* config, extension_t* output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig* config, extension_t* output) { - return VectorSubExtFieldDispatcher::execute(vec_a, vec_b, n, *config, 
output); + return VectorSubExtFieldDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError vector_sub( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig& config, extension_t* output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_vector_sub)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, extension_vector_sub)(vec_a, vec_b, size, &config, output); } #endif // EXT_FIELD /*********************************** MUL ***********************************/ - ICICLE_DISPATCHER_INST(VectorMulDispatcher, vector_mul, scalarVectorOpImpl); + ICICLE_DISPATCHER_INST(VectorMulDispatcher, vector_mul, vectorVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_mul)( - const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorMulDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorMulDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError - vector_mul(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_mul(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_mul)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_mul)(vec_a, vec_b, size, &config, output); } #ifdef EXT_FIELD ICICLE_DISPATCHER_INST(VectorMulExtFieldDispatcher, extension_vector_mul, extFieldVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_mul)( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig* config, extension_t* output) + const extension_t* vec_a, const extension_t* 
vec_b, uint64_t size, const VecOpsConfig* config, extension_t* output) { - return VectorMulExtFieldDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorMulExtFieldDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError vector_mul( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig& config, extension_t* output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_vector_mul)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, extension_vector_mul)(vec_a, vec_b, size, &config, output); } #endif // EXT_FIELD /*********************************** DIV ***********************************/ - ICICLE_DISPATCHER_INST(VectorDivDispatcher, vector_div, scalarVectorOpImpl); + ICICLE_DISPATCHER_INST(VectorDivDispatcher, vector_div, vectorVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_div)( - const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorDivDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorDivDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError - vector_div(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_div(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_div)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_div)(vec_a, vec_b, size, &config, output); } /*********************************** (Scalar + Vector) ELEMENT WISE ***********************************/ ICICLE_DISPATCHER_INST(ScalarAddDispatcher, scalar_add_vec, scalarVectorOpImpl); extern "C" 
eIcicleError CONCAT_EXPAND(FIELD, scalar_add_vec)( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig* config, scalar_t* output) { - return ScalarAddDispatcher::execute(scalar_a, vec_b, n, *config, output); + return ScalarAddDispatcher::execute(scalar_a, vec_b, size, use_single_scalar, *config, output); } template <> eIcicleError scalar_add_vec( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, scalar_add_vec)(scalar_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, scalar_add_vec)(scalar_a, vec_b, size, use_single_scalar, &config, output); } /*********************************** (Scalar - Vector) ELEMENT WISE ***********************************/ ICICLE_DISPATCHER_INST(ScalarSubDispatcher, scalar_sub_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_sub_vec)( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig* config, scalar_t* output) { - return ScalarSubDispatcher::execute(scalar_a, vec_b, n, *config, output); + return ScalarSubDispatcher::execute(scalar_a, vec_b, size, use_single_scalar, *config, output); } template <> eIcicleError scalar_sub_vec( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, scalar_sub_vec)(scalar_a, vec_b, n, &config, output); + 
return CONCAT_EXPAND(FIELD, scalar_sub_vec)(scalar_a, vec_b, size, use_single_scalar, &config, output); } /*********************************** MUL BY SCALAR ***********************************/ ICICLE_DISPATCHER_INST(ScalarMulDispatcher, scalar_mul_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_mul_vec)( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig* config, scalar_t* output) { - return ScalarMulDispatcher::execute(scalar_a, vec_b, n, *config, output); + return ScalarMulDispatcher::execute(scalar_a, vec_b, size, use_single_scalar, *config, output); } template <> eIcicleError scalar_mul_vec( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, scalar_mul_vec)(scalar_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, scalar_mul_vec)(scalar_a, vec_b, size, use_single_scalar, &config, output); } /*********************************** CONVERT MONTGOMERY ***********************************/ @@ -234,16 +234,16 @@ namespace icicle { ICICLE_DISPATCHER_INST(ScalarConvertMontgomeryDispatcher, scalar_convert_montgomery, scalarConvertMontgomeryImpl) extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_convert_montgomery)( - const scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, scalar_t* output) + const scalar_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig* config, scalar_t* output) { - return ScalarConvertMontgomeryDispatcher::execute(input, size, is_into, *config, output); + return ScalarConvertMontgomeryDispatcher::execute(input, size, is_to_montgomery, *config, output); } template <> eIcicleError - 
convert_montgomery(const scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig& config, scalar_t* output) + convert_montgomery(const scalar_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, scalar_convert_montgomery)(input, size, is_into, &config, output); + return CONCAT_EXPAND(FIELD, scalar_convert_montgomery)(input, size, is_to_montgomery, &config, output); } #ifdef EXT_FIELD @@ -251,16 +251,16 @@ namespace icicle { ExtFieldConvertMontgomeryDispatcher, extension_scalar_convert_montgomery, extFieldConvertMontgomeryImpl) extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_scalar_convert_montgomery)( - const extension_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, extension_t* output) + const extension_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig* config, extension_t* output) { - return ExtFieldConvertMontgomeryDispatcher::execute(input, size, is_into, *config, output); + return ExtFieldConvertMontgomeryDispatcher::execute(input, size, is_to_montgomery, *config, output); } template <> eIcicleError convert_montgomery( - const extension_t* input, uint64_t size, bool is_into, const VecOpsConfig& config, extension_t* output) + const extension_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_scalar_convert_montgomery)(input, size, is_into, &config, output); + return CONCAT_EXPAND(FIELD, extension_scalar_convert_montgomery)(input, size, is_to_montgomery, &config, output); } #endif // EXT_FIELD @@ -304,11 +304,12 @@ namespace icicle { const scalar_t* input, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig* config, scalar_t* output) { - return ScalarSliceDispatcher::execute(input, offset, stride, size, *config, output); + return ScalarSliceDispatcher::execute(input, offset, stride, size_in, 
size_out, *config, output);
  }

  template <>
@@ -316,11 +317,12 @@ namespace icicle {
    const scalar_t* input,
    uint64_t offset,
    uint64_t stride,
-    uint64_t size,
+    uint64_t size_in,
+    uint64_t size_out,
    const VecOpsConfig& config,
    scalar_t* output)
  {
-    return CONCAT_EXPAND(FIELD, slice)(input, offset, stride, size, &config, output);
+    return CONCAT_EXPAND(FIELD, slice)(input, offset, stride, size_in, size_out, &config, output);
  }

#ifdef EXT_FIELD
@@ -350,7 +352,7 @@ namespace icicle {
  }
#endif // EXT_FIELD

-  /*********************************** HIGHEST NON ZERO IDX ***********************************/
+  /*********************************** HIGHEST NON ZERO IDX ***********************************/

  ICICLE_DISPATCHER_INST(ScalarHighestNonZeroIdxDispatcher, highest_non_zero_idx, scalarHighNonZeroIdxOpImpl)

@@ -399,8 +401,8 @@ namespace icicle {
  ICICLE_DISPATCHER_INST(ScalarPolyDivDispatcher, poly_division, scalarPolyDivImpl)

  extern "C" eIcicleError CONCAT_EXPAND(FIELD, poly_division)(
-    const scalar_t* numerator,
-    int64_t numerator_deg,
+    const scalar_t* numerator,
+    int64_t numerator_deg,
    const scalar_t* denumerator,
    int64_t denumerator_deg,
    const VecOpsConfig* config,
@@ -410,13 +412,13 @@ namespace icicle {
    uint64_t r_size)
  {
    return ScalarPolyDivDispatcher::execute(
-      numerator, numerator_deg, denumerator, denumerator_deg, *config, q_out, q_size, r_out, r_size);
+      numerator, numerator_deg, denumerator, denumerator_deg, *config, q_out, q_size, r_out, r_size);
  }

  template <>
  eIcicleError polynomial_division(
-    const scalar_t* numerator,
-    int64_t numerator_deg,
+    const scalar_t* numerator,
+    int64_t numerator_deg,
    const scalar_t* denumerator,
    int64_t denumerator_deg,
    const VecOpsConfig& config,
@@ -426,7 +428,7 @@ namespace icicle {
    uint64_t r_size)
  {
    return CONCAT_EXPAND(FIELD, poly_division)(
-      numerator, numerator_deg, denumerator, denumerator_deg, &config, q_out, q_size, r_out, r_size);
+      numerator, numerator_deg, denumerator, denumerator_deg, &config, q_out, q_size, r_out, r_size);
  }

-} // namespace icicle
\ No newline at end of file
+} // namespace icicle
\ No newline at end of file

From 1488732c21430ff2036eef8f83b5fe9ebac1d304 Mon Sep 17 00:00:00 2001
From: Shanie Winitz
Date: Sat, 12 Oct 2024 19:27:19 +0300
Subject: [PATCH 08/43] vec_ops - added: config.batch, parallel transpose,
 tests

---
 icicle/backend/cpu/src/field/cpu_vec_ops.cpp |  227 ++--
 .../include/icicle/backend/vec_ops_backend.h |  191 ++-
 icicle/include/icicle/fields/host_math.h     |    2 +-
 .../default_backend/default_poly_backend.h   |   10 +-
 icicle/include/icicle/vec_ops.h              |   16 +-
 icicle/src/vec_ops.cpp                       |   28 +-
 icicle/tests/test_curve_api.cpp              |    3 +-
 icicle/tests/test_field_api.cpp              | 1000 ++++++++++++++---
 8 files changed, 1085 insertions(+), 392 deletions(-)

diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp
index 952f5108f..a56cdc73c 100644
--- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp
+++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp
@@ -20,16 +20,17 @@ enum VecOperation {
   VECTOR_SUB,
   VECTOR_MUL,
   VECTOR_DIV,
-  VECTOR_SUM,
   CONVERT_TO_MONTGOMERY,
   CONVERT_FROM_MONTGOMERY,
+  VECTOR_SUM,
   VECTOR_PRODUCT,
   SCALAR_ADD_VEC,
   SCALAR_SUB_VEC,
   SCALAR_MUL_VEC,
   BIT_REVERSE,
   SLICE,
-  REPLACE_ELEMENTS,
+  REPLACE_ELEMENTS,
+  OUT_OF_PLACE_MATRIX_TRANSPOSE,

   NOF_OPERATIONS
 };
@@ -80,8 +81,8 @@ class VectorOpTask : public TaskBase
     dispatch();
   }

-  // Set the operands for bitrev operation and dispatch the task
-  void send_bitrev_task(
+  // Set the operands for bit_reverse operation and dispatch the task
+  void send_bit_reverse_task(
     VecOperation operation, uint32_t bit_size, uint64_t start_index, const uint32_t nof_operations, const T* op_a, const uint64_t stride, T* output)
   {
     m_operation = operation;
@@ -121,9 +122,22 @@ class VectorOpTask : public TaskBase
     dispatch();
   }

+  void send_out_of_place_matrix_transpose_task(VecOperation operation, const T* mat_in, const uint32_t nof_operations, 
const uint32_t nof_rows, const uint32_t nof_cols, const uint32_t stride, T* mat_out) + { + m_operation = operation; + m_op_a = mat_in; + m_nof_operations = nof_operations; + m_nof_rows = nof_rows; + m_nof_cols = nof_cols; + m_stride = stride; + m_output = mat_out; + dispatch(); + } // Execute the selected function based on m_operation - virtual void execute() { (this->*functionPtrs[static_cast(m_operation)])(); } + virtual void execute() { + (this->*functionPtrs[static_cast(m_operation)])(); + } private: // Single worker functionality to execute vector add (+) @@ -172,17 +186,17 @@ class VectorOpTask : public TaskBase // Single worker functionality to execute sum(vector) void vector_sum() { - m_intermidiate_res[m_idx_in_batch] = T::zero(); + m_intermidiate_res = T::zero(); for (uint64_t i = 0; i < (m_stop_index * m_stride); i = i + m_stride) { - m_intermidiate_res[m_idx_in_batch] = m_intermidiate_res[m_idx_in_batch] + m_op_a[i]; + m_intermidiate_res = m_intermidiate_res + m_op_a[i]; } } // Single worker functionality to execute product(vector) void vector_product() { - m_intermidiate_res[m_idx_in_batch] = T::one(); + m_intermidiate_res = T::one(); for (uint64_t i = 0; i < (m_stop_index * m_stride); i = i + m_stride) { - m_intermidiate_res[m_idx_in_batch] = m_intermidiate_res[m_idx_in_batch] * m_op_a[i]; + m_intermidiate_res = m_intermidiate_res * m_op_a[i]; } } // Single worker functionality to execute scalar + vector @@ -247,8 +261,6 @@ class VectorOpTask : public TaskBase while (shifted_idx >= mod) { shifted_idx = (shifted_idx & mod) + (shifted_idx >> total_bits); } - // If shifted_idx == mod, result should be 0 since mod % mod == 0 - if (shifted_idx == mod) shifted_idx = 0; //TODO SHANIE - check if redundant return shifted_idx; } @@ -260,15 +272,29 @@ class VectorOpTask : public TaskBase for (uint32_t i = 0; i < m_nof_operations; ++i) { uint64_t start_idx = (*m_start_indices_in_mat)[m_start_index + i]; uint64_t idx = start_idx; + T prev = m_op_a[m_stride * idx]; 
do { uint64_t shifted_idx = idx << m_log_nof_rows; uint64_t new_idx = mersenne_mod(shifted_idx, total_bits); - m_output[m_stride * new_idx] = m_op_a[m_stride * idx]; + T next = m_op_a[m_stride * new_idx]; + m_output[m_stride * new_idx] = prev; + prev = next; idx = new_idx; } while (idx != start_idx); } } + // Single worker functionality for out of palce matrix transpose + void out_of_place_transpose() + { + for (uint32_t k = 0; k < m_nof_operations; ++k) { + for (uint32_t j = 0; j < m_nof_cols; ++j) { + m_output[m_stride * (j * m_nof_rows + k)] = m_op_a[m_stride * (k * m_nof_cols + j)]; + } + } + } + + // An array of available function pointers arranged according to the VecOperation enum using FunctionPtr = void (VectorOpTask::*)(); @@ -286,25 +312,30 @@ class VectorOpTask : public TaskBase &VectorOpTask::scalar_mul_vec, // SCALAR_MUL_VEC, &VectorOpTask::bit_reverse, // BIT_REVERSE &VectorOpTask::slice, // SLICE - &VectorOpTask::replace_elements // REPLACE_ELEMENTS + &VectorOpTask::replace_elements, // REPLACE_ELEMENTS + &VectorOpTask::out_of_place_transpose // OUT_OF_PLACE_MATRIX_TRANSPOSE + + }; VecOperation m_operation; // the operation to execute uint32_t m_nof_operations; // number of operations to execute for this task const T* m_op_a; // pointer to operand A. Operand A is a vector, or metrix in case of replace_elements const T* m_op_b; // pointer to operand B. Operand B is a vector or scalar - uint64_t m_start_index; // index used in bitreverse operation - uint64_t m_stop_index; // index used in reduce operations + uint64_t m_start_index; // index used in bitreverse operation and out of place matrix transpose + uint64_t m_stop_index; // index used in reduce operations and out of place matrix transpose uint32_t m_bit_size; // use in bitrev operation uint64_t m_stride; // used to support column batch operations uint64_t m_stride_out; // used in slice operation T* m_output; // pointer to the output. 
Can be a vector, scalar pointer, or a matrix pointer in case of replace_elements uint32_t m_log_nof_rows; // log of the number of rows in the matrix, used in replace_elements uint32_t m_log_nof_cols; // log of the number of columns in the matrix, used in replace_elements + uint32_t m_nof_rows; // the number of rows in the matrix, used in out of place matrix transpose + uint32_t m_nof_cols; // the number of columns in the matrix, used in out of place matrix transpose const std::vector* m_start_indices_in_mat; // Indices used in replace_elements operations public: - T* m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer + T m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer uint64_t m_idx_in_batch; // index in the batch. Used in intermidiate res tasks }; // class VectorOpTask @@ -325,7 +356,7 @@ template eIcicleError cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); + TasksManager> task_manager(get_nof_workers(config) - 1); const uint64_t total_nof_operations = size*config.batch_size; for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); @@ -340,7 +371,7 @@ template eIcicleError cpu_scalar_vector_op( VecOperation op, const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); + TasksManager> task_manager(get_nof_workers(config) - 1); const uint64_t total_nof_operations = use_single_scalar? size*config.batch_size : size; const uint32_t stride = (!use_single_scalar && config.columns_batch)? config.batch_size : 1; for (uint32_t idx_in_batch = 0; idx_in_batch < (use_single_scalar? 
1 : config.batch_size); idx_in_batch++) { @@ -416,15 +447,17 @@ template eIcicleError cpu_convert_montgomery( const Device& device, const T* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); + TasksManager> task_manager(get_nof_workers(config) - 1); const uint64_t total_nof_operations = size*config.batch_size; for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_1op_task( - is_to_montgomery ? CONVERT_TO_MONTGOMERY : CONVERT_FROM_MONTGOMERY, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), + (is_to_montgomery ? CONVERT_TO_MONTGOMERY : CONVERT_FROM_MONTGOMERY), std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), input + i, output + i); } task_manager.wait_done(); + for (uint64_t i = 0; i < size*config.batch_size; i++) { + } return eIcicleError::SUCCESS; } @@ -443,7 +476,7 @@ REGISTER_CONVERT_MONTGOMERY_EXT_FIELD_BACKEND("CPU", cpu_convert_montgomery eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); + TasksManager> task_manager(get_nof_workers(config) - 1); std::vector output_initialized = std::vector(config.batch_size, false); uint64_t vec_a_offset = 0; uint64_t idx_in_batch = 0; @@ -454,7 +487,7 @@ eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t size, return eIcicleError::SUCCESS; } if (task_p->is_completed()) { - output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] ? output[task_p->m_idx_in_batch] + task_p->m_intermidiate_res[task_p->m_idx_in_batch] : task_p->m_intermidiate_res[task_p->m_idx_in_batch]; + output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] ? 
output[task_p->m_idx_in_batch] + task_p->m_intermidiate_res : task_p->m_intermidiate_res; output_initialized[task_p->m_idx_in_batch] = true; } if (vec_a_offset < size) { @@ -482,7 +515,7 @@ REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); template eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); + TasksManager> task_manager(get_nof_workers(config) - 1); std::vector output_initialized = std::vector(config.batch_size, false); uint64_t vec_a_offset = 0; uint64_t idx_in_batch = 0; @@ -493,7 +526,7 @@ eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t s return eIcicleError::SUCCESS; } if (task_p->is_completed()) { - output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] ? output[task_p->m_idx_in_batch] + task_p->m_intermidiate_res[task_p->m_idx_in_batch] : task_p->m_intermidiate_res[task_p->m_idx_in_batch]; + output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] ? 
output[task_p->m_idx_in_batch] * task_p->m_intermidiate_res : task_p->m_intermidiate_res; output_initialized[task_p->m_idx_in_batch] = true; } if (vec_a_offset < size) { @@ -548,41 +581,32 @@ eIcicleError cpu_scalar_mul( REGISTER_SCALAR_MUL_VEC_BACKEND("CPU", cpu_scalar_mul); /*********************************** TRANSPOSE ***********************************/ -// template todo shanie - remove -// eIcicleError cpu_matrix_transpose_basic( -// const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) -// { -// ICICLE_ASSERT(mat_in && mat_out && nof_rows != 0 && nof_cols != 0) << "Invalid argument"; - -// // Perform the matrix transpose -// for (uint32_t i = 0; i < nof_rows; ++i) { -// for (uint32_t j = 0; j < nof_cols; ++j) { -// mat_out[j * nof_rows + i] = mat_in[i * nof_cols + j]; -// } -// } - -// return eIcicleError::SUCCESS; -// } template -eIcicleError cpu_matrix_transpose_batch( +eIcicleError out_of_place_matrix_transpose( const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) { - const T* cur_mat_in = mat_in; - T* cur_mat_out = mat_out; + TasksManager> task_manager(get_nof_workers(config) - 1); uint32_t stride = config.columns_batch? config.batch_size : 1; - const uint64_t total_elements = static_cast(nof_rows) * nof_cols; + const uint64_t total_elements_one_mat = static_cast(nof_rows) * nof_cols; + const uint32_t NOF_ROWS_PER_TASK = std::min((uint64_t)nof_rows, std::max((uint64_t)(NOF_OPERATIONS_PER_TASK / nof_cols) , (uint64_t)1)); for (uint32_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + const T* cur_mat_in = config.columns_batch? mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements_one_mat; + T* cur_mat_out = config.columns_batch? 
mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements_one_mat; // Perform the matrix transpose - for (uint32_t i = 0; i < nof_rows; ++i) { - for (uint32_t j = 0; j < nof_cols; ++j) { - cur_mat_out[stride*(j * nof_rows + i)] = cur_mat_in[stride*(i * nof_cols + j)]; - } + for (uint32_t i = 0; i < nof_rows; i += NOF_ROWS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_out_of_place_matrix_transpose_task( + OUT_OF_PLACE_MATRIX_TRANSPOSE, + cur_mat_in + stride*i*nof_cols, + std::min((uint64_t)NOF_ROWS_PER_TASK, (uint64_t)nof_rows - i), + nof_rows, + nof_cols, + stride, + cur_mat_out + (stride * i)); } - cur_mat_in += (config.columns_batch ? 1 : total_elements); - cur_mat_out += (config.columns_batch ? 1 : total_elements); } - + task_manager.wait_done(); return eIcicleError::SUCCESS; } @@ -595,20 +619,6 @@ uint32_t gcd(uint32_t a, uint32_t b) { return a; } -// template //TODO shanie - remove -// void replace_elements(uint32_t start_idx, uint32_t log_nof_rows, uint32_t log_nof_cols, const T* mat_in, T* mat_out) { -// uint64_t idx = start_idx; - -// while (true) { -// uint64_t new_idx = mersenne_mod(idx << log_nof_rows, log_nof_rows+log_nof_cols); // new_idx = (idx< void gen_necklace(uint32_t t, uint32_t p, uint32_t k, uint32_t length, std::vector& necklace, std::vector& task_indices) { @@ -620,9 +630,6 @@ void gen_necklace(uint32_t t, uint32_t p, uint32_t k, uint32_t length, std::vect start_idx += necklace[i] * multiplier; multiplier *= k; } - // for (int i = 1; i <= length; ++i) { // Compute start_idx as the decimal representation of the necklace //TODO SHANIE - remove - // start_idx = start_idx + necklace[i] * std::pow(k, length - i); - // } task_indices.push_back(start_idx); } return; @@ -638,53 +645,63 @@ void gen_necklace(uint32_t t, uint32_t p, uint32_t k, uint32_t length, std::vect } template -eIcicleError cpu_matrix_transpose_parallel( - const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t 
nof_cols, const VecOpsConfig& config, T* mat_out) -{ - ICICLE_ASSERT(mat_in && mat_out && nof_rows != 0 && nof_cols != 0) << "Invalid argument"; - - // check if the number of rows and columns are powers of 2, if not use the basic transpose - if ((nof_rows & (nof_rows - 1)) != 0 || (nof_cols & (nof_cols - 1)) != 0) { - cpu_matrix_transpose_batch(device, mat_in, nof_rows, nof_cols, config, mat_out); - return eIcicleError::SUCCESS; - } - +eIcicleError matrix_transpose_necklaces(const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out){ uint32_t log_nof_rows = static_cast(std::floor(std::log2(nof_rows))); uint32_t log_nof_cols = static_cast(std::floor(std::log2(nof_cols))); uint32_t gcd_value = gcd(log_nof_rows, log_nof_cols); uint32_t k = 1 << gcd_value; // Base of necklaces uint32_t length = (log_nof_cols + log_nof_rows) / gcd_value; // length of necklaces. Since all are powers of 2, equvalent to (log_nof_cols + log_nof_rows) / gcd_value; const uint64_t max_nof_operations = NOF_OPERATIONS_PER_TASK / length; - const uint64_t total_elements = static_cast(nof_rows) * nof_cols; + const uint64_t total_elements_one_mat = static_cast(nof_rows) * nof_cols; std::vector necklace(length + 1, 0); std::vector start_indices_in_mat; // Collect start indices gen_necklace(1, 1, k, length, necklace, start_indices_in_mat); - TasksManager> task_manager(get_nof_workers(config)); + TasksManager> task_manager(get_nof_workers(config) - 1); for (uint64_t i = 0; i < start_indices_in_mat.size(); i += max_nof_operations) { uint64_t nof_operations = std::min((uint64_t)max_nof_operations, start_indices_in_mat.size() - i); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_replace_elements_task( REPLACE_ELEMENTS, - config.columns_batch? mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements, + config.columns_batch? 
mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements_one_mat, nof_operations, start_indices_in_mat, i, log_nof_rows, log_nof_cols, config.columns_batch? config.batch_size : 1, - config.columns_batch? mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements); + config.columns_batch? mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements_one_mat); } } task_manager.wait_done(); return eIcicleError::SUCCESS; } -REGISTER_MATRIX_TRANSPOSE_BACKEND("CPU", cpu_matrix_transpose_parallel); + +template +eIcicleError cpu_matrix_transpose( + const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) +{ + ICICLE_ASSERT(mat_in && mat_out && nof_rows != 0 && nof_cols != 0) << "Invalid argument"; + + // check if the number of rows and columns are powers of 2, if not use the basic transpose + bool is_power_of_2 = (nof_rows & (nof_rows - 1)) == 0 && (nof_cols & (nof_cols - 1)) == 0; + bool is_inplace = mat_in == mat_out; + if (!is_inplace) { + return(out_of_place_matrix_transpose(device, mat_in, nof_rows, nof_cols, config, mat_out)); + } else if (is_power_of_2) { + return (matrix_transpose_necklaces(mat_in, nof_rows, nof_cols, config, mat_out)); + } else { + ICICLE_LOG_ERROR << "Matrix transpose is not supported for inplace non power of 2 rows and columns"; + return eIcicleError::INVALID_ARGUMENT; + } +} + +REGISTER_MATRIX_TRANSPOSE_BACKEND("CPU", cpu_matrix_transpose); #ifdef EXT_FIELD -REGISTER_MATRIX_TRANSPOSE_EXT_FIELD_BACKEND("CPU", cpu_matrix_transpose_parallel); +REGISTER_MATRIX_TRANSPOSE_EXT_FIELD_BACKEND("CPU", cpu_matrix_transpose); #endif // EXT_FIELD /*********************************** BIT REVERSE ***********************************/ @@ -698,12 +715,12 @@ cpu_bit_reverse(const Device& device, const T* vec_in, uint64_t size, const VecO ICICLE_ASSERT((1ULL << logn) == size) << "Invalid argument - size is not a power of 2"; // Perform the bit reverse - TasksManager> 
task_manager(get_nof_workers(config)); + TasksManager> task_manager(get_nof_workers(config) - 1); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { for (uint64_t i = 0; i < size; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_bitrev_task( + task_p->send_bit_reverse_task( BIT_REVERSE, logn, i, @@ -739,7 +756,7 @@ eIcicleError cpu_slice( ICICLE_ASSERT(vec_in != nullptr && vec_out != nullptr) << "Error: Invalid argument - input or output vector is null"; ICICLE_ASSERT(offset + (size_out-1) * stride < size_in) << "Error: Invalid argument - slice out of bound"; - TasksManager> task_manager(get_nof_workers(config)); + TasksManager> task_manager(get_nof_workers(config) - 1); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { for (uint64_t i = 0; i < size_out; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); @@ -817,19 +834,19 @@ REGISTER_POLYNOMIAL_EVAL("CPU", cpu_poly_eval); /*============================== polynomial division ==============================*/ template -void school_book_division_step_cpu(T* r, T* q, const T* b, int deg_r, int deg_b, const T& lc_b_inv) +void school_book_division_step_cpu(T* r, T* q, const T* b, int deg_r, int deg_b, const T& lc_b_inv, uint32_t stride = 1) { int64_t monomial = deg_r - deg_b; // monomial=1 is 'x', monomial=2 is x^2 etc. 
- T lc_r = r[deg_r]; + T lc_r = r[deg_r * stride]; // leading coefficient of r T monomial_coeff = lc_r * lc_b_inv; // lc_r / lc_b // adding monomial s to q (q=q+s) - q[monomial] = monomial_coeff; + q[monomial * stride] = monomial_coeff; for (int i = monomial; i <= deg_r; ++i) { - T b_coeff = b[i - monomial]; - r[i] = r[i] - monomial_coeff * b_coeff; + T b_coeff = b[(i - monomial) * stride]; + r[i * stride] = r[i * stride] - monomial_coeff * b_coeff; } } @@ -840,33 +857,37 @@ eIcicleError cpu_poly_divide( int64_t numerator_deg, const T* denumerator, int64_t denumerator_deg, + uint64_t q_size, + uint64_t r_size, const VecOpsConfig& config, T* q_out /*OUT*/, - uint64_t q_size, - T* r_out /*OUT*/, - uint64_t r_size) + T* r_out /*OUT*/) { ICICLE_ASSERT(r_size >= numerator_deg) << "polynomial division expects r(x) size to be similar to numerator size and higher than numerator degree(x)"; ICICLE_ASSERT(q_size >= (numerator_deg - denumerator_deg + 1)) << "polynomial division expects q(x) size to be at least deg(numerator)-deg(denumerator)+1"; - ICICLE_CHECK(icicle_copy_async(r_out, numerator, r_size * config.batch_size * sizeof(T), config.stream)); + // ICICLE_CHECK(icicle_copy_async(r_out, numerator, r_size * config.batch_size * sizeof(T), config.stream)); + // copy numerator to r_out // FIXME should it be copied using icicle_copy_async? + for (uint64_t i = 0; i < (numerator_deg+1)*config.batch_size; ++i) { + r_out[i] = numerator[i]; + } + uint32_t stride = config.columns_batch? config.batch_size : 1; + auto deg_r = std::make_unique(config.batch_size); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) { const T* curr_denumerator = config.columns_batch? denumerator + idx_in_batch : denumerator + idx_in_batch * (denumerator_deg+1); // Pointer to the current vector T* curr_q_out = config.columns_batch? q_out + idx_in_batch : q_out + idx_in_batch * q_size; // Pointer to the current vector T* curr_r_out = config.columns_batch? 
r_out + idx_in_batch : r_out + idx_in_batch * r_size; // Pointer to the current vector // invert largest coeff of b - const T& lc_b_inv = T::inverse(curr_denumerator[denumerator_deg]); - int64_t deg_r = numerator_deg; - while (deg_r >= denumerator_deg) { + const T& lc_b_inv = T::inverse(curr_denumerator[denumerator_deg * stride]); + deg_r[idx_in_batch] = numerator_deg; + while (deg_r[idx_in_batch] >= denumerator_deg) { // each iteration is removing the largest monomial in r until deg(r); + using vectorVectorOpImplInplaceA = std::function; + + using scalarConvertMontgomeryImpl = std::function; + using VectorReduceOpImpl = std::function; using scalarVectorOpImpl = std::function; + using scalarMatrixOpImpl = std::function; - using vectorVectorOpImpl = std::function; - using vectorVectorOpImplInplaceA = std::function; + using scalarSliceOpImpl = std::function; - void register_vector_sum(const std::string& deviceType, scalarVectorReduceOpImpl impl); + using scalarHighNonZeroIdxOpImpl = std::function; -#define REGISTER_VECTOR_SUM_BACKEND(DEVICE_TYPE, FUNC) \ - namespace { \ - static bool UNIQUE(_reg_vec_sum) = []() -> bool { \ - register_vector_sum(DEVICE_TYPE, FUNC); \ - return true; \ - }(); \ - } + using scalarPolyEvalImpl = std::function; - void register_vector_product(const std::string& deviceType, scalarVectorReduceOpImpl impl); + using scalarPolyDivImpl = std::function; -#define REGISTER_VECTOR_PRODUCT_BACKEND(DEVICE_TYPE, FUNC) \ - namespace { \ - static bool UNIQUE(_reg_vec_product) = []() -> bool { \ - register_vector_product(DEVICE_TYPE, FUNC); \ - return true; \ - }(); \ - } @@ -108,6 +151,36 @@ namespace icicle { }(); \ } + void register_scalar_convert_montgomery(const std::string& deviceType, scalarConvertMontgomeryImpl); + +#define REGISTER_CONVERT_MONTGOMERY_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_scalar_convert_mont) = []() -> bool { \ + register_scalar_convert_montgomery(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } 
+ + void register_vector_sum(const std::string& deviceType, VectorReduceOpImpl impl); + +#define REGISTER_VECTOR_SUM_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_sum) = []() -> bool { \ + register_vector_sum(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_vector_product(const std::string& deviceType, VectorReduceOpImpl impl); + +#define REGISTER_VECTOR_PRODUCT_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_product) = []() -> bool { \ + register_vector_product(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + void register_scalar_mul_vec(const std::string& deviceType, scalarVectorOpImpl impl); #define REGISTER_SCALAR_MUL_VEC_BACKEND(DEVICE_TYPE, FUNC) \ @@ -138,32 +211,6 @@ namespace icicle { }(); \ } - using scalarConvertMontgomeryImpl = std::function; - - void register_scalar_convert_montgomery(const std::string& deviceType, scalarConvertMontgomeryImpl); - -#define REGISTER_CONVERT_MONTGOMERY_BACKEND(DEVICE_TYPE, FUNC) \ - namespace { \ - static bool UNIQUE(_reg_scalar_convert_mont) = []() -> bool { \ - register_scalar_convert_montgomery(DEVICE_TYPE, FUNC); \ - return true; \ - }(); \ - } - - using scalarMatrixOpImpl = std::function; - void register_matrix_transpose(const std::string& deviceType, scalarMatrixOpImpl impl); #define REGISTER_MATRIX_TRANSPOSE_BACKEND(DEVICE_TYPE, FUNC) \ @@ -174,9 +221,6 @@ namespace icicle { }(); \ } - using scalarBitReverseOpImpl = std::function; - void register_scalar_bit_reverse(const std::string& deviceType, scalarBitReverseOpImpl); #define REGISTER_BIT_REVERSE_BACKEND(DEVICE_TYPE, FUNC) \ @@ -187,16 +231,6 @@ namespace icicle { }(); \ } - using scalarSliceOpImpl = std::function; - void register_slice(const std::string& deviceType, scalarSliceOpImpl); #define REGISTER_SLICE_BACKEND(DEVICE_TYPE, FUNC) \ @@ -207,9 +241,6 @@ namespace icicle { }(); \ } - using scalarHighNonZeroIdxOpImpl = std::function; - void register_highest_non_zero_idx(const 
std::string& deviceType, scalarHighNonZeroIdxOpImpl); #define REGISTER_HIGHEST_NON_ZERO_IDX_BACKEND(DEVICE_TYPE, FUNC) \ @@ -220,24 +251,6 @@ namespace icicle { }(); \ } - template - eIcicleError polynomial_eval( - const T* coeffs, - uint64_t coeffs_size, - const T* domain, - uint64_t domain_size, - const VecOpsConfig& config, - T* evals /*OUT*/); - - using scalarPolyEvalImpl = std::function; - void register_poly_eval(const std::string& deviceType, scalarPolyEvalImpl); #define REGISTER_POLYNOMIAL_EVAL(DEVICE_TYPE, FUNC) \ @@ -248,18 +261,6 @@ namespace icicle { }(); \ } - using scalarPolyDivImpl = std::function; - void register_poly_division(const std::string& deviceType, scalarPolyDivImpl); #define REGISTER_POLYNOMIAL_DIVISION(DEVICE_TYPE, FUNC) \ diff --git a/icicle/include/icicle/fields/host_math.h b/icicle/include/icicle/fields/host_math.h index e256aa922..9ced242d3 100644 --- a/icicle/include/icicle/fields/host_math.h +++ b/icicle/include/icicle/fields/host_math.h @@ -288,7 +288,7 @@ namespace host_math { r = left_shift(r); r.limbs[0] |= ((num.limbs[limb_idx] >> bit_idx) & 1); uint32_t c = add_sub_limbs(r, denom, temp); - if (limb_idx < NLIMBS_Q & !c) { + if ((limb_idx < NLIMBS_Q) & !c) { r = temp; q.limbs[limb_idx] |= 1 << bit_idx; } diff --git a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h index 7c0cca845..0ee0e2d0f 100644 --- a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h +++ b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h @@ -278,7 +278,15 @@ namespace icicle { config.is_result_on_device = true; ICICLE_CHECK(icicle::polynomial_division( - a_coeffs, deg_a, b_coeffs, deg_b, config, Q_coeffs, deg_a - deg_b + 1, R_coeffs, a_N)); + a_coeffs, + deg_a, + b_coeffs, + deg_b, + deg_a - deg_b + 1, + a_N, + config, + Q_coeffs, + R_coeffs)); } void quotient(PolyContext Q, PolyContext op_a, PolyContext op_b) 
override diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index 42dfca8bd..b89327eb4 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -295,6 +295,7 @@ namespace icicle { * @return eIcicleError Error code indicating success or failure. * @note The input matrices are assumed to be stored in row-major order. * This function transposes an input matrix or a batch of matrices. + * Matrix transpose inplace is not supported for non-power of 2 rows and columns. */ template eIcicleError @@ -312,7 +313,7 @@ namespace icicle { * - The layout depends on `config.columns_batch`: * - If `false`, vectors are stored contiguously. * - If `true`, vectors are stored as columns in a 2D array. - * @param size Number of elements in each vector. + * @param size Number of elements in each vector. Must be a power of 2. * @param config Configuration for the operation. * @param vec_out Pointer to the output vector(s) where the results will be stored. * The output array should have the same storage layout as the input vectors. @@ -337,6 +338,7 @@ namespace icicle { * @return eIcicleError Error code indicating success or failure. * @note The total input size is `size_in * config.batch_size`. * The total output size is `size_out * config.batch_size`. + * parameters must satisfy: offset + (size_out-1) * stride < size_in */ template eIcicleError @@ -350,7 +352,7 @@ namespace icicle { * @param size Number of elements in each input vector. * @param config Configuration for the operation. * @param out_idx Pointer to an array where the output indices of the highest non-zero element in each input vector will be stored. - * The array should have a length of at least `config.batch_size`. + * The array should have a length of `config.batch_size`. * @return eIcicleError Error code indicating success or failure. */ template @@ -398,13 +400,13 @@ namespace icicle { * - Storage layout is similar to `numerator`. 
* @param denominator_deg Degree of the denominator polynomial. * @param config Configuration for the operation. + * @param q_size Size of the quotient array for one polynomial. + * @param r_size Size of the remainder array. * @param q_out Pointer to the array where the quotient polynomial(s) will be stored. This is an output parameter. * - The storage layout should match that of `numerator`. - * @param q_size Size of the quotient array for one polynomial. * @param r_out Pointer to the array where the remainder polynomial(s) will be stored. This is an output parameter. * - The storage layout should match that of `numerator`. * - The size of `r_out` should be sufficient to hold the remainder coefficients for each polynomial. - * @param r_size Size of the remainder array. * @return eIcicleError Error code indicating success or failure. * * @note The degrees should satisfy `numerator_deg >= denominator_deg`. @@ -417,10 +419,10 @@ namespace icicle { int64_t numerator_deg, const T* denumerator, int64_t denumerator_deg, + uint64_t q_size, + uint64_t r_size, const VecOpsConfig& config, T* q_out /*OUT*/, - uint64_t q_size, - T* r_out /*OUT*/, - uint64_t r_size); + T* r_out /*OUT*/); } // namespace icicle \ No newline at end of file diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index db86e6e73..2c16ed389 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -5,7 +5,7 @@ namespace icicle { /*********************************** REDUCE PRODUCT ************************/ - ICICLE_DISPATCHER_INST(VectorProductDispatcher, vector_product, scalarVectorReduceOpImpl); + ICICLE_DISPATCHER_INST(VectorProductDispatcher, vector_product, VectorReduceOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_product)( const scalar_t* vec_a, uint64_t size, const VecOpsConfig* config, scalar_t* output) @@ -21,7 +21,7 @@ namespace icicle { } /*********************************** REDUCE SUM ****************************/ - ICICLE_DISPATCHER_INST(VectorSumDispatcher, 
vector_sum, scalarVectorReduceOpImpl ); + ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, VectorReduceOpImpl ); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_sum)( const scalar_t* vec_a, uint64_t size, const VecOpsConfig* config, scalar_t* output) @@ -401,34 +401,34 @@ namespace icicle { ICICLE_DISPATCHER_INST(ScalarPolyDivDispatcher, poly_division, scalarPolyDivImpl) extern "C" eIcicleError CONCAT_EXPAND(FIELD, poly_division)( - const scalar_t* sizeumerator, - int64_t sizeumerator_deg, + const scalar_t* numerator, + int64_t numerator_deg, const scalar_t* denumerator, int64_t denumerator_deg, + uint64_t q_size, + uint64_t r_size, const VecOpsConfig* config, scalar_t* q_out /*OUT*/, - uint64_t q_size, - scalar_t* r_out /*OUT*/, - uint64_t r_size) + scalar_t* r_out /*OUT*/) { return ScalarPolyDivDispatcher::execute( - sizeumerator, sizeumerator_deg, denumerator, denumerator_deg, *config, q_out, q_size, r_out, r_size); + numerator, numerator_deg, denumerator, denumerator_deg, q_size, r_size, *config, q_out, r_out); } template <> eIcicleError polynomial_division( - const scalar_t* sizeumerator, - int64_t sizeumerator_deg, + const scalar_t* numerator, + int64_t numerator_deg, const scalar_t* denumerator, int64_t denumerator_deg, + uint64_t q_size, + uint64_t r_size, const VecOpsConfig& config, scalar_t* q_out /*OUT*/, - uint64_t q_size, - scalar_t* r_out /*OUT*/, - uint64_t r_size) + scalar_t* r_out /*OUT*/) { return CONCAT_EXPAND(FIELD, poly_division)( - sizeumerator, sizeumerator_deg, denumerator, denumerator_deg, &config, q_out, q_size, r_out, r_size); + numerator, numerator_deg, denumerator, denumerator_deg, q_size, r_size, &config, q_out, r_out); } } // sizeamespace icicle \ No newline at end of file diff --git a/icicle/tests/test_curve_api.cpp b/icicle/tests/test_curve_api.cpp index 0769df7f9..9fe8bbe67 100644 --- a/icicle/tests/test_curve_api.cpp +++ b/icicle/tests/test_curve_api.cpp @@ -190,8 +190,7 @@ TEST_F(CurveApiTest, ecntt) run(s_main_target, 
out_main.get(), "ecntt", VERBOSE /*=measure*/, 1 /*=iters*/); run(s_ref_target, out_ref.get(), "ecntt", VERBOSE /*=measure*/, 1 /*=iters*/); - // ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), N * sizeof(projective_t))); // TODO ucomment when CPU is - // implemented + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), N * sizeof(projective_t))); } #endif // ECNTT diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 9743c6d2d..5aa9dd973 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -1,7 +1,11 @@ +#include #include #include #include "dlfcn.h" +#include #include +#include // For system + #include "icicle/runtime.h" #include "icicle/vec_ops.h" @@ -22,9 +26,11 @@ using FpMicroseconds = std::chrono::duration class FieldApiTest : public ::testing::Test @@ -38,9 +44,8 @@ class FieldApiTest : public ::testing::Test #endif icicle_load_backend_from_env_or_default(); - const bool is_cuda_registered = is_device_registered("CUDA"); - if (!is_cuda_registered) { ICICLE_LOG_ERROR << "CUDA device not found. Testing CPU vs CPU"; } - s_main_target = is_cuda_registered ? "CUDA" : "CPU"; + if (!s_is_cuda_registered) { ICICLE_LOG_ERROR << "CUDA device not found. Testing CPU vs reference (on cpu)"; } + s_main_target = s_is_cuda_registered ? 
"CUDA" : "CPU"; s_reference_target = "CPU"; } static void TearDownTestSuite() @@ -84,16 +89,20 @@ TYPED_TEST(FieldApiTest, FieldSanityTest) ASSERT_EQ(a * scalar_t::from(2), a + a); } -TYPED_TEST(FieldApiTest, vectorOps) -{ - const uint64_t N = 1 << 22; - auto in_a = std::make_unique(N); - auto in_b = std::make_unique(N); - FieldApiTest::random_samples(in_a.get(), N); - FieldApiTest::random_samples(in_b.get(), N); - auto out_main = std::make_unique(N); - auto out_ref = std::make_unique(N); +TYPED_TEST(FieldApiTest, vectorVectorOps) +{ + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const int total_size = N * batch_size; + auto in_a = std::make_unique(total_size); + auto in_b = std::make_unique(total_size); + auto out_main = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); auto vector_accumulate_wrapper = [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { @@ -105,6 +114,8 @@ TYPED_TEST(FieldApiTest, vectorOps) Device dev = {dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; std::ostringstream oss; oss << dev_type << " " << msg; @@ -116,45 +127,307 @@ TYPED_TEST(FieldApiTest, vectorOps) END_TIMER(VECADD_sync, oss.str().c_str(), measure); }; + // warmup // run(s_reference_target, out_ref.get(), false /*=measure*/, 16 /*=iters*/); // run(s_main_target, out_main.get(), false /*=measure*/, 1 /*=iters*/); + + // warmup + // run(s_reference_target, out_ref.get(), false /*=measure*/, 16 /*=iters*/); + // run(s_main_target, out_main.get(), false /*=measure*/, 1 /*=iters*/); + + // Element-wise vector operations + // If config.batch_size>1, (columns_batch=true or false) the operation is done element-wise anyway, so it doesn't affect the test 
- // accumulate - auto temp_result = std::make_unique(N); - auto initial_in_a = std::make_unique(N); - - std::memcpy(initial_in_a.get(), in_a.get(), N * sizeof(TypeParam)); - run(s_reference_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); - std::memcpy(temp_result.get(), in_a.get(), N * sizeof(TypeParam)); - std::memcpy(in_a.get(), initial_in_a.get(), N * sizeof(TypeParam)); - run(s_main_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); - ASSERT_EQ(0, memcmp(in_a.get(), temp_result.get(), N * sizeof(TypeParam))); - - // add - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); + // // add + FieldApiTest::random_samples(in_a.get(),total_size); + FieldApiTest::random_samples(in_b.get(),total_size); + if (!s_is_cuda_registered) { + for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] + in_b[i]; } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); + } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), N * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + + // // accumulate + FieldApiTest::random_samples(in_a.get(),total_size); + FieldApiTest::random_samples(in_b.get(),total_size); + if (!s_is_cuda_registered) { + for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] + in_b[i]; } + } else { + run(s_reference_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); + } + run(s_main_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); + ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), total_size * sizeof(TypeParam))); - // sub - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sub, "vector sub", ITERS); + // // sub + 
FieldApiTest::random_samples(in_a.get(),total_size); + FieldApiTest::random_samples(in_b.get(),total_size); + if (!s_is_cuda_registered) { + for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] - in_b[i]; } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sub, "vector sub", ITERS); + } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sub, "vector sub", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), N * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); - // mul - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_mul, "vector mul", ITERS); + // // mul + FieldApiTest::random_samples(in_a.get(),total_size); + FieldApiTest::random_samples(in_b.get(),total_size); + if (!s_is_cuda_registered) { + for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] * in_b[i]; } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_mul, "vector mul", ITERS); + } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_mul, "vector mul", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), N * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + + // // div + FieldApiTest::random_samples(in_a.get(),total_size); + FieldApiTest::random_samples(in_b.get(),total_size); + // reference + if (!s_is_cuda_registered) { + for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] * TypeParam::inverse(in_b[i]); } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_div, "vector div", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_div, "vector div", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + } -TYPED_TEST(FieldApiTest, matrixAPIsAsync) +TYPED_TEST(FieldApiTest, montgomeryConversion) { - const int R = 1 << 10, C = 1 << 8; - auto h_in = 
std::make_unique(R * C); - FieldApiTest::random_samples(h_in.get(), R * C); + int seed = time(0); + srand(seed); + // ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const bool is_to_montgomery = rand() % 2; + const int total_size = N * batch_size; + auto in_a = std::make_unique(total_size); + auto out_main = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); + + auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + + std::ostringstream oss; + oss << dev_type << " " << msg; + + START_TIMER(MONTGOMERY) + for (int i = 0; i < iters; ++i) { + ICICLE_CHECK(convert_montgomery(in_a.get(), N, is_to_montgomery, config, out)); + } + END_TIMER(MONTGOMERY, oss.str().c_str(), measure); + }; + + // Element-wise operation + // If config.batch_size>1, (columns_batch=true or false) the addition is done element-wise anyway, so it doesn't affect the test + + // convert_montgomery + FieldApiTest::random_samples(in_a.get(),total_size); + // reference + if (!s_is_cuda_registered) { + if (is_to_montgomery) { for (int i = 0; i < total_size; i++) { out_ref[i] = TypeParam::to_montgomery(in_a[i]); } } + else { for (int i = 0; i < total_size; i++) { out_ref[i] = TypeParam::from_montgomery(in_a[i]); } } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "montgomery", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "montgomery", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); +} - auto h_out_main = std::make_unique(R * C); - auto h_out_ref = std::make_unique(R * C); + +TYPED_TEST(FieldApiTest, VectorReduceOps) +{ + int seed = time(0); + 
srand(seed); + // ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const int total_size = N * batch_size; + auto in_a = std::make_unique(total_size); + auto out_main = std::make_unique(batch_size); + auto out_ref = std::make_unique(batch_size); + + auto vector_accumulate_wrapper = + [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { + return vector_accumulate(a, b, size, config); + }; + + auto run = + [&](const std::string& dev_type, TypeParam* out, bool measure, auto vec_op_func, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + + std::ostringstream oss; + oss << dev_type << " " << msg; + + START_TIMER(VECADD_sync) + for (int i = 0; i < iters; ++i) { + ICICLE_CHECK(vec_op_func(in_a.get(), N, config, out)); + } + END_TIMER(VECADD_sync, oss.str().c_str(), measure); + }; + + // // sum + FieldApiTest::random_samples(in_a.get(),total_size); + // reference + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + out_ref[idx_in_batch] = TypeParam::from(0); + } + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { + uint64_t idx_a = columns_batch ? 
idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; + out_ref[idx_in_batch] = out_ref[idx_in_batch] + in_a[idx_a]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(TypeParam))); + + + // // product + FieldApiTest::random_samples(in_a.get(),total_size); + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + out_ref[idx_in_batch] = TypeParam::from(1); + } + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { + uint64_t idx_a = columns_batch ? idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; + out_ref[idx_in_batch] = out_ref[idx_in_batch]*in_a[idx_a]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(TypeParam))); +} + +TYPED_TEST(FieldApiTest, scalarVectorOps) +{ + int seed = time(0); + srand(seed); + // ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const bool use_single_scalar = rand() % 2; + const int total_size = N * batch_size; + auto scalar_a = std::make_unique(use_single_scalar? 
1 : batch_size); + auto in_b = std::make_unique(total_size); + auto out_main = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); + + auto vector_accumulate_wrapper = + [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { + return vector_accumulate(a, b, size, config); + }; + + auto run = + [&](const std::string& dev_type, TypeParam* out, bool measure, auto vec_op_func, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + + std::ostringstream oss; + oss << dev_type << " " << msg; + + START_TIMER(VECADD_sync) + for (int i = 0; i < iters; ++i) { + ICICLE_CHECK(vec_op_func(scalar_a.get(), in_b.get(), N, use_single_scalar, config, out)); + } + END_TIMER(VECADD_sync, oss.str().c_str(), measure); + }; + + // // scalar add vec + FieldApiTest::random_samples(scalar_a.get(),(use_single_scalar? 1: batch_size)); + FieldApiTest::random_samples(in_b.get(),total_size); + + // reference + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { + uint64_t idx_b = columns_batch ? idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; + out_ref[idx_b] = (use_single_scalar? scalar_a[0] : scalar_a[idx_in_batch]) + in_b[idx_b]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + + + // // scalar sub vec + FieldApiTest::random_samples(scalar_a.get(),(use_single_scalar? 
1: batch_size)); + FieldApiTest::random_samples(in_b.get(),total_size); + + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { + uint64_t idx_b = columns_batch ? idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; + out_ref[idx_b] = (use_single_scalar? scalar_a[0] : scalar_a[idx_in_batch]) - in_b[idx_b]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); + } + + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + + // // scalar mul vec + FieldApiTest::random_samples(scalar_a.get(),(use_single_scalar? 1: batch_size)); + FieldApiTest::random_samples(in_b.get(),total_size); + + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { + uint64_t idx_b = columns_batch ? idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; + out_ref[idx_b] = (use_single_scalar? 
scalar_a[0] : scalar_a[idx_in_batch]) * in_b[idx_b]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); +} + +TYPED_TEST(FieldApiTest, matrixAPIsAsync) +{ + int seed = time(0); + srand(seed); + // ICICLE_LOG_DEBUG << "seed = " << seed; + const int R = 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 + const int C = 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 + const int batch_size = 1 << (rand() % 4); + const bool columns_batch = rand() % 2; + const bool is_in_place = rand() % 2; + // ICICLE_LOG_DEBUG << "R = " << R << ", C = " << C << ", batch_size = " << batch_size << ", columns_batch = " << columns_batch << ", is_in_place = " << is_in_place; //TODO SHANIE - remove this + const int total_size = R * C * batch_size; + auto h_inout = std::make_unique(total_size); + auto h_out_main = std::make_unique(total_size); + auto h_out_ref = std::make_unique(total_size); auto run = [&](const std::string& dev_type, TypeParam* h_out, bool measure, const char* msg, int iters) { Device dev = {dev_type, 0}; @@ -163,6 +436,8 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) DeviceProperties device_props; icicle_get_device_properties(device_props); auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; std::ostringstream oss; oss << dev_type << " " << msg; @@ -174,14 +449,14 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) icicle_create_stream(&config.stream); icicle_malloc_async((void**)&d_in, R * C * sizeof(TypeParam), config.stream); icicle_malloc_async((void**)&d_out, R * C * sizeof(TypeParam), config.stream); -
icicle_copy_to_device_async(d_in, h_in.get(), R * C * sizeof(TypeParam), config.stream); + icicle_copy_to_device_async(d_in, h_inout.get(), R * C * sizeof(TypeParam), config.stream); config.is_a_on_device = true; config.is_result_on_device = true; config.is_async = false; } - TypeParam* in = device_props.using_host_memory ? h_in.get() : d_in; + TypeParam* in = device_props.using_host_memory ? h_inout.get() : d_in; TypeParam* out = device_props.using_host_memory ? h_out : d_out; START_TIMER(TRANSPOSE) @@ -198,190 +473,577 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) } }; - run(s_reference_target, h_out_ref.get(), VERBOSE /*=measure*/, "transpose", ITERS); - run(s_main_target, h_out_main.get(), VERBOSE /*=measure*/, "transpose", ITERS); - ASSERT_EQ(0, memcmp(h_out_main.get(), h_out_ref.get(), R * C * sizeof(TypeParam))); + // // Option 1: Initialize each input matrix in the batch with the same ascending values + // for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + // for (uint32_t i = 0; i < R * C; i++) { + // if(columns_batch){ + // h_inout[idx_in_batch + batch_size * i] = TypeParam::from(i); + // } else { + // h_inout[idx_in_batch * R * C + i] = TypeParam::from(i); + // } + // } + // } + + // // Option 2: Initialize the entire input array with ascending values + // for (int i = 0; i < total_size; i++) { + // h_inout[i] = TypeParam::from(i); + // } + + // Option 3: Initialize the entire input array with random values + FieldApiTest::random_samples(h_inout.get(),total_size); + + // Reference implementation + if (!s_is_cuda_registered) { + const TypeParam* cur_mat_in = h_inout.get(); + TypeParam* cur_mat_out = h_out_ref.get(); + uint32_t stride = columns_batch? 
batch_size : 1; + const uint64_t total_elements_one_mat = static_cast(R) * C; + for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + // Perform the matrix transpose + for (uint32_t i = 0; i < R; ++i) { + for (uint32_t j = 0; j < C; ++j) { + cur_mat_out[stride*(j * R + i)] = cur_mat_in[stride*(i * C + j)]; + } + } + cur_mat_in += (columns_batch ? 1 : total_elements_one_mat); + cur_mat_out += (columns_batch ? 1 : total_elements_one_mat); + } + } else { + run(s_reference_target, (is_in_place? h_inout.get() : h_out_ref.get()), VERBOSE /*=measure*/, "transpose", ITERS); + } + + run(s_main_target, (is_in_place? h_inout.get() : h_out_main.get()), VERBOSE /*=measure*/, "transpose", ITERS); + if (is_in_place) { + ASSERT_EQ(0, memcmp(h_inout.get(), h_out_ref.get(), total_size * sizeof(TypeParam))); + } else { + // std::cout << "h_out_main:\t["; for (int i = 0; i < total_size-1; i++) { std::cout << h_out_main[i] << ", "; } std::cout <(N); - auto elements_ref = std::make_unique(N); - FieldApiTest::random_samples(elements_main.get(), N); - memcpy(elements_ref.get(), elements_main.get(), N * sizeof(TypeParam)); + int seed = time(0); + srand(seed); + // ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const bool is_in_place = rand() % 2; + const int total_size = N * batch_size; + + // const uint64_t N = 1 << (2); + // const int batch_size = 1 << (1); + // const bool columns_batch = true; + // const bool is_in_place = true; + // const int total_size = N * batch_size; + + auto in_a = std::make_unique(total_size); + auto out_main = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); - auto run = [&](const std::string& dev_type, TypeParam* inout, bool measure, const char* msg, int iters) { + auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { Device dev = 
{dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; std::ostringstream oss; oss << dev_type << " " << msg; - START_TIMER(MONTGOMERY) + START_TIMER(BIT_REVERSE) for (int i = 0; i < iters; ++i) { - ICICLE_CHECK(convert_montgomery(inout, N, true /*into montgomery*/, config, inout)); + ICICLE_CHECK(bit_reverse(in_a.get(), N, config, out)); } - END_TIMER(MONTGOMERY, oss.str().c_str(), measure); + END_TIMER(BIT_REVERSE, oss.str().c_str(), measure); }; - run(s_reference_target, elements_main.get(), VERBOSE /*=measure*/, "montgomery", 1); - run(s_main_target, elements_ref.get(), VERBOSE /*=measure*/, "montgomery", 1); - ASSERT_EQ(0, memcmp(elements_main.get(), elements_ref.get(), N * sizeof(TypeParam))); + + // // Option 1: Initialize each input vector in the batch with the same ascending values + // for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + // for (uint32_t i = 0; i < N; i++) { + // if(columns_batch){ + // in_a[idx_in_batch + batch_size * i] = TypeParam::from(i); + // } else { + // in_a[idx_in_batch * N + i] = TypeParam::from(i); + // } + // } + // } + + // // Option 2: Initialize the entire input array with ascending values + // for (int i = 0; i < total_size; i++) { + // in_a[i] = TypeParam::from(i); + // } + + // Option 3: Initialize the entire input array with random values + FieldApiTest::random_samples(in_a.get(),total_size); + + + // Reference implementation + if (!s_is_cuda_registered) { + uint64_t logn = 0; + uint64_t temp = N; + while (temp > 1) { + temp >>= 1; + logn++; + } + //BIT REVERSE FUNCTION + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t i = 0; i < N; i++) { + int rev = 0; + for (int j = 0; j < logn; ++j) { + if (i & (1 << j)) { rev |= 1 << (logn - 1 - j); } + } + if(columns_batch){ + out_ref[idx_in_batch + batch_size * i] = in_a[idx_in_batch + batch_size * rev]; + // 
ICICLE_LOG_DEBUG << "out_ref[" << idx_in_batch + batch_size * i << "] = in_a[" << idx_in_batch + batch_size * rev << "]"; + } else { + out_ref[idx_in_batch * N + i] = in_a[idx_in_batch * N + rev]; + // ICICLE_LOG_DEBUG << "out_ref[" << idx_in_batch * N + i << "] = in_a[" << idx_in_batch * N + rev << "]"; + } + } + } + } else { + run(s_reference_target, (is_in_place? in_a.get() : out_ref.get()), VERBOSE /*=measure*/, "bit-reverse", 1); + } + run(s_main_target, (is_in_place? in_a.get() : out_main.get()), VERBOSE /*=measure*/, "bit-reverse", 1); + + if (is_in_place) { + ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), N * sizeof(TypeParam))); + } else { + // std::cout << "out_main:\t["; for (int i = 0; i < total_size-1; i++) { std::cout << out_main[i] << ", "; } std::cout <(N); - auto elements_ref = std::make_unique(N); - FieldApiTest::random_samples(elements_main.get(), N); - memcpy(elements_ref.get(), elements_main.get(), N * sizeof(TypeParam)); + int seed = time(0); + srand(seed); + // ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t size_in = 1 << (rand() % 15 + 5); + const uint64_t offset = rand() % 15; + const uint64_t stride = rand() % 4 + 1; + const uint64_t size_out = rand() % (((size_in - offset)/stride)-1) + 1; + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const int total_size_in = size_in * batch_size; + const int total_size_out = size_out * batch_size; + // ICICLE_LOG_DEBUG << "size_in = " << size_in << ", offset = " << offset << ", stride = " << stride << ", size_out = " << size_out << ", batch_size = " << batch_size << ", columns_batch = " << columns_batch; + + auto in_a = std::make_unique(total_size_in); + auto out_main = std::make_unique(total_size_out); + auto out_ref = std::make_unique(total_size_out); - auto run = [&](const std::string& dev_type, TypeParam* inout, bool measure, const char* msg, int iters) { + auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int 
iters) { Device dev = {dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; std::ostringstream oss; oss << dev_type << " " << msg; - START_TIMER(BIT_REVERSE) + START_TIMER(SLICE) for (int i = 0; i < iters; ++i) { - ICICLE_CHECK(bit_reverse(inout, N, config, inout)); + ICICLE_CHECK(slice(in_a.get(), offset ,stride ,size_in , size_out , config, out)); } - END_TIMER(BIT_REVERSE, oss.str().c_str(), measure); + END_TIMER(SLICE, oss.str().c_str(), measure); }; - run(s_reference_target, elements_main.get(), VERBOSE /*=measure*/, "bit-reverse", 1); - run(s_main_target, elements_ref.get(), VERBOSE /*=measure*/, "bit-reverse", 1); - ASSERT_EQ(0, memcmp(elements_main.get(), elements_ref.get(), N * sizeof(TypeParam))); -} + // // Option 1: Initialize each input vector in the batch with the same ascending values + // for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + // for (uint32_t i = 0; i < size_in; i++) { + // if(columns_batch){ + // in_a[idx_in_batch + batch_size * i] = TypeParam::from(i); + // } else { + // in_a[idx_in_batch * size_in + i] = TypeParam::from(i); + // } + // } + // } -TYPED_TEST(FieldApiTest, Slice) + // // Option 2: Initialize the entire input array with ascending values + // for (int i = 0; i < total_size_in; i++) { + // in_a[i] = TypeParam::from(i); + // } + + // Option 3: Initialize the entire input array with random values + FieldApiTest::random_samples(in_a.get(),total_size_in); + + + // Reference implementation + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t i = 0; i < size_out; i++) { + if(columns_batch){ + out_ref[idx_in_batch + batch_size * i] = in_a[idx_in_batch + batch_size * (offset + i*stride)]; + } else { + out_ref[idx_in_batch * size_out + i] = in_a[idx_in_batch * size_in + (offset + i*stride)]; + } + } + } + } else { + 
run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "slice", 1); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "slice", 1); + // std::cout << "out_main\t["; for (int i = 0; i < total_size_out-1; i++) { std::cout << out_main[i] << ", "; } std::cout <(rand() % 4); - bool columns_batch; - if (logn == 7 || logn < 4) { - columns_batch = false; // currently not supported (icicle_v3/backend/cuda/src/ntt/ntt.cuh line 578) - } else { - columns_batch = rand() % 2; + // ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t coeffs_size = 1 << (rand() % 10 + 4); + const uint64_t domain_size = 1 << (rand() % 8 + 2); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const int total_coeffs_size = coeffs_size * batch_size; + + auto in_coeffs = std::make_unique(total_coeffs_size); + auto in_domain = std::make_unique(domain_size); + auto out_main = std::make_unique(total_coeffs_size); + auto out_ref = std::make_unique(total_coeffs_size); + + auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + + std::ostringstream oss; + oss << dev_type << " " << msg; + + START_TIMER(polynomialEval) + for (int i = 0; i < iters; ++i) { + ICICLE_CHECK(polynomial_eval(in_coeffs.get(), coeffs_size, in_domain.get(), domain_size , config, out)); + } + END_TIMER(polynomialEval, oss.str().c_str(), measure); + }; + + FieldApiTest::random_samples(in_coeffs.get(), total_coeffs_size); + FieldApiTest::random_samples(in_domain.get(), domain_size); + + + // Reference implementation + // TODO - Check in comperison with GPU implementation + + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); + if (s_is_cuda_registered) { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "polynomial_eval", 
1); + // std::cout << "out_main:\t["; for (int i = 0; i < total_coeffs_size-1; i++) { std::cout << out_main[i] << ", "; } std::cout <(rand() % 2); // 0: forward, 1: inverse - const int log_coset_stride = rand() % 3; - scalar_t coset_gen; - if (log_coset_stride) { - coset_gen = scalar_t::omega(logn + log_coset_stride); - } else { - coset_gen = scalar_t::one(); + } - const int total_size = N * batch_size; - auto scalars = std::make_unique(total_size); - FieldApiTest::random_samples(scalars.get(), total_size); - auto out_main = std::make_unique(total_size); - auto out_ref = std::make_unique(total_size); - auto run = [&](const std::string& dev_type, TypeParam* out, const char* msg, bool measure, int iters) { + +TYPED_TEST(FieldApiTest, polynomialDivision) +{ + int seed = time(0); + srand(seed); + // ICICLE_LOG_DEBUG << "seed = " << seed; + // const int64_t numerator_deg = 1 << 4; + // const int64_t denumerator_deg = 1 << 2; + // const uint64_t q_size = numerator_deg - denumerator_deg + 1; + // const uint64_t r_size = numerator_deg + 1; + const int64_t numerator_deg = 3; + const int64_t denumerator_deg = 2; + const uint64_t q_size = 2; + const uint64_t r_size = 4; + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + + const int64_t total_numerator_size = (numerator_deg+1) * batch_size; + const int64_t total_denumerator_size = (denumerator_deg+1) * batch_size; + const uint64_t total_q_size = q_size * batch_size; + const uint64_t total_r_size = r_size * batch_size; + + auto numerator = std::make_unique(total_numerator_size); + auto denumerator = std::make_unique(total_denumerator_size); + auto q_out_main = std::make_unique(total_q_size); + auto r_out_main = std::make_unique(total_r_size); + auto q_out_ref = std::make_unique(total_q_size); + auto r_out_ref = std::make_unique(total_r_size); + + auto run = [&](const std::string& dev_type, TypeParam* q_out, TypeParam* r_out, bool measure, const char* msg, int iters) { Device dev = {dev_type, 
0}; icicle_set_device(dev); - icicleStreamHandle stream = nullptr; - ICICLE_CHECK(icicle_create_stream(&stream)); - auto init_domain_config = default_ntt_init_domain_config(); - init_domain_config.stream = stream; - init_domain_config.is_async = false; - ConfigExtension ext; - ext.set(CudaBackendConfig::CUDA_NTT_FAST_TWIDDLES_MODE, true); - init_domain_config.ext = &ext; - auto config = default_ntt_config(); - config.stream = stream; - config.coset_gen = coset_gen; - config.batch_size = batch_size; // default: 1 - config.columns_batch = columns_batch; // default: false - config.ordering = ordering; // default: kNN - config.are_inputs_on_device = true; - config.are_outputs_on_device = true; - config.is_async = false; - ICICLE_CHECK(ntt_init_domain(scalar_t::omega(log_ntt_domain_size), init_domain_config)); - TypeParam *d_in, *d_out; - ICICLE_CHECK(icicle_malloc_async((void**)&d_in, total_size * sizeof(TypeParam), config.stream)); - ICICLE_CHECK(icicle_malloc_async((void**)&d_out, total_size * sizeof(TypeParam), config.stream)); - ICICLE_CHECK(icicle_copy_to_device_async(d_in, scalars.get(), total_size * sizeof(TypeParam), config.stream)); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + + std::ostringstream oss; oss << dev_type << " " << msg; - START_TIMER(NTT_sync) + + START_TIMER(polynomialDivision) for (int i = 0; i < iters; ++i) { - if (inplace) { - ICICLE_CHECK(ntt(d_in, N, dir, config, d_in)); - } else { - ICICLE_CHECK(ntt(d_in, N, dir, config, d_out)); - } + ICICLE_CHECK(polynomial_division(numerator.get(), numerator_deg, denumerator.get(), denumerator_deg , q_size, r_size, config, q_out, r_out)); } - END_TIMER(NTT_sync, oss.str().c_str(), measure); + END_TIMER(polynomialDivision, oss.str().c_str(), measure); + }; - if (inplace) { - ICICLE_CHECK(icicle_copy_to_host_async(out, d_in, total_size * sizeof(TypeParam), config.stream)); + // // Option 1: Initialize input vectors with random values 
+ // FieldApiTest::random_samples(numerator.get(), total_numerator_size); + // FieldApiTest::random_samples(denumerator.get(), total_denumerator_size); + // // Reference implementation + // TODO - Check in comperison with GPU implementation or implement a general reference implementation + + // Option 2: Initialize the numerator and denumerator with chosen example + // And the reference implementation for the example + + for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + if (columns_batch){ + // numerator = 3x^3+4x^2+5 + numerator[idx_in_batch + 0*batch_size] = TypeParam::from(5); + numerator[idx_in_batch + 1*batch_size] = TypeParam::from(0); + numerator[idx_in_batch + 2*batch_size] = TypeParam::from(4); + numerator[idx_in_batch + 3*batch_size] = TypeParam::from(3); + // denumerator = x^2-1 + denumerator[idx_in_batch + 0*batch_size] = TypeParam::from(0) - TypeParam::from(1); + denumerator[idx_in_batch + 1*batch_size] = TypeParam::from(0); + denumerator[idx_in_batch + 2*batch_size] = TypeParam::from(1); + if (!s_is_cuda_registered) { + // q_out_ref = 3x+4 + q_out_ref[idx_in_batch + 0*batch_size] = TypeParam::from(4); + q_out_ref[idx_in_batch + 1*batch_size] = TypeParam::from(3); + // r_out_ref = 3x+9 + r_out_ref[idx_in_batch + 0*batch_size] = TypeParam::from(9); + r_out_ref[idx_in_batch + 1*batch_size] = TypeParam::from(3); + } } else { - ICICLE_CHECK(icicle_copy_to_host_async(out, d_out, total_size * sizeof(TypeParam), config.stream)); + // numerator = 3x^3+4x^2+5 + numerator[idx_in_batch * (numerator_deg+1) + 0] = TypeParam::from(5); + numerator[idx_in_batch * (numerator_deg+1) + 1] = TypeParam::from(0); + numerator[idx_in_batch * (numerator_deg+1) + 2] = TypeParam::from(4); + numerator[idx_in_batch * (numerator_deg+1) + 3] = TypeParam::from(3); + // denumerator = x^2-1 + denumerator[idx_in_batch * (denumerator_deg+1) + 0] = TypeParam::from(0) - TypeParam::from(1); + denumerator[idx_in_batch * (denumerator_deg+1) + 1] = 
TypeParam::from(0); + denumerator[idx_in_batch * (denumerator_deg+1) + 2] = TypeParam::from(1); + if (!s_is_cuda_registered) { + // q_out_ref = 3x+4 + q_out_ref[idx_in_batch * q_size + 0] = TypeParam::from(4); + q_out_ref[idx_in_batch * q_size + 1] = TypeParam::from(3); + // r_out_ref = 3x+9 + r_out_ref[idx_in_batch * r_size + 0] = TypeParam::from(9); + r_out_ref[idx_in_batch * r_size + 1] = TypeParam::from(3); + } } - ICICLE_CHECK(icicle_free_async(d_in, config.stream)); - ICICLE_CHECK(icicle_free_async(d_out, config.stream)); - ICICLE_CHECK(icicle_stream_synchronize(config.stream)); - ICICLE_CHECK(icicle_destroy_stream(stream)); - ICICLE_CHECK(ntt_release_domain()); - }; - run(s_main_target, out_main.get(), "ntt", false /*=measure*/, 1 /*=iters*/); // warmup - run(s_reference_target, out_ref.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); - run(s_main_target, out_main.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); + } + + if (s_is_cuda_registered) { + run(s_reference_target, q_out_ref.get(), r_out_ref.get(), VERBOSE /*=measure*/, "polynomial_division", 1); + } + // std::cout << "numerator:\t["; for (int i = 0; i < total_numerator_size-1; i++) { std::cout << numerator[i] << ", "; } std::cout <(rand() % 4); +// // bool columns_batch; +// // if (logn == 7 || logn < 4) { +// // columns_batch = false; // currently not supported (icicle_v3/backend/cuda/src/ntt/ntt.cuh line 578) +// // } else { +// // // columns_batch = true; +// // columns_batch = rand() % 2; +// // } +// // // const NTTDir dir = static_cast(rand() % 2); // 0: forward, 1: inverse +// // const NTTDir dir = static_cast(0); // 0: forward, 1: inverse +// // const int log_coset_stride = rand() % 3; +// // scalar_t coset_gen; +// // if (log_coset_stride) { +// // coset_gen = scalar_t::omega(logn + log_coset_stride); +// // } else { +// // coset_gen = scalar_t::one(); +// // } + +// const bool inplace = false; +// 
const int logn = 15; +// const uint64_t N = 1 << logn; +// const int log_ntt_domain_size = logn; +// const int log_batch_size = 0; +// const int batch_size = 1 << log_batch_size; +// const Ordering ordering = static_cast(0); +// bool columns_batch = false; +// const NTTDir dir = static_cast(0); // 0: forward, 1: inverse +// const int log_coset_stride = 0; +// scalar_t coset_gen; +// if (log_coset_stride) { +// coset_gen = scalar_t::omega(logn + log_coset_stride); +// } else { +// coset_gen = scalar_t::one(); +// } + +// // TODO SHANIE : remove +// // ICICLE_LOG_INFO << "NTT test: seed=" << seed; +// // ICICLE_LOG_INFO << "NTT test: omega=" << scalar_t::omega(logn); +// // ICICLE_LOG_INFO << "NTT test:s inplace=" << inplace; +// ICICLE_LOG_INFO << "NTT test: logn=" << logn; +// // ICICLE_LOG_INFO << "NTT test: log_ntt_domain_size=" << log_ntt_domain_size; +// // ICICLE_LOG_INFO << "NTT test: log_batch_size=" << log_batch_size; +// // ICICLE_LOG_INFO << "NTT test: columns_batch=" << columns_batch; +// // ICICLE_LOG_INFO << "NTT test: ordering=" << int(ordering); +// ICICLE_LOG_INFO << "NTT test: dir=" << (dir == NTTDir::kForward ? 
"forward" : "inverse"); +// ICICLE_LOG_INFO << "NTT test: log_coset_stride=" << log_coset_stride; +// ICICLE_LOG_INFO << "NTT test: coset_gen=" << coset_gen; + + + +// const int total_size = N * batch_size; +// auto scalars = std::make_unique(total_size); +// FieldApiTest::random_samples(scalars.get(), total_size); +// // for (int i = 0; i < total_size; i++) { scalars[i] = scalar_t::from(i); } //FIXME SHANIE: remove +// auto out_main = std::make_unique(total_size); +// auto out_ref = std::make_unique(total_size); +// auto run = [&](const std::string& dev_type, TypeParam* out, const char* msg, bool measure, int iters) { +// Device dev = {dev_type, 0}; +// icicle_set_device(dev); +// icicleStreamHandle stream = nullptr; +// ICICLE_CHECK(icicle_create_stream(&stream)); +// auto init_domain_config = default_ntt_init_domain_config(); +// init_domain_config.stream = stream; +// init_domain_config.is_async = false; +// ConfigExtension ext; +// ext.set(CudaBackendConfig::CUDA_NTT_FAST_TWIDDLES_MODE, true); +// init_domain_config.ext = &ext; +// auto config = default_ntt_config(); +// config.stream = stream; +// config.coset_gen = coset_gen; +// config.batch_size = batch_size; // default: 1 +// config.columns_batch = columns_batch; // default: false +// config.ordering = ordering; // default: kNN +// config.are_inputs_on_device = true; +// config.are_outputs_on_device = true; +// config.is_async = false; +// ICICLE_CHECK(ntt_init_domain(scalar_t::omega(log_ntt_domain_size), init_domain_config)); +// TypeParam *d_in, *d_out; +// ICICLE_CHECK(icicle_malloc_async((void**)&d_in, total_size * sizeof(TypeParam), config.stream)); +// ICICLE_CHECK(icicle_malloc_async((void**)&d_out, total_size * sizeof(TypeParam), config.stream)); +// ICICLE_CHECK(icicle_copy_to_device_async(d_in, scalars.get(), total_size * sizeof(TypeParam), config.stream)); +// std::ostringstream oss; +// oss << dev_type << " " << msg; +// START_TIMER(NTT_sync) +// for (int i = 0; i < iters; ++i) { +// if 
(inplace) { +// ICICLE_CHECK(ntt(d_in, N, dir, config, d_in)); +// } else { +// ICICLE_CHECK(ntt(d_in, N, dir, config, d_out)); +// } +// } +// END_TIMER(NTT_sync, oss.str().c_str(), measure); + +// if (inplace) { +// ICICLE_CHECK(icicle_copy_to_host_async(out, d_in, total_size * sizeof(TypeParam), config.stream)); +// } else { +// ICICLE_CHECK(icicle_copy_to_host_async(out, d_out, total_size * sizeof(TypeParam), config.stream)); +// } +// ICICLE_CHECK(icicle_free_async(d_in, config.stream)); +// ICICLE_CHECK(icicle_free_async(d_out, config.stream)); +// ICICLE_CHECK(icicle_stream_synchronize(config.stream)); +// ICICLE_CHECK(icicle_destroy_stream(stream)); +// ICICLE_CHECK(ntt_release_domain()); +// }; +// // run(s_main_target, out_main.get(), "ntt", false /*=measure*/, 0 /*=iters*/); // warmup +// run(s_reference_target, out_ref.get(), "V3ntt", VERBOSE /*=measure*/, 10 /*=iters*/); +// run(s_main_target, out_main.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); +// // std::cout << "left:\t["; for (int i = 0; i < total_size-1; i++) { std::cout << out_main[i] << ", "; } std::cout < Date: Sun, 13 Oct 2024 18:01:09 +0300 Subject: [PATCH 09/43] vecops with batch - documentation --- docs/docs/icicle/golang-bindings/vec-ops.md | 10 ++++- docs/docs/icicle/primitives/vec_ops.md | 45 +++++++++++++++---- docs/docs/icicle/programmers_guide/general.md | 5 +++ docs/docs/icicle/rust-bindings/vec-ops.md | 13 ++++-- 4 files changed, 58 insertions(+), 15 deletions(-) diff --git a/docs/docs/icicle/golang-bindings/vec-ops.md b/docs/docs/icicle/golang-bindings/vec-ops.md index e93d9a0a2..e219ec26d 100644 --- a/docs/docs/icicle/golang-bindings/vec-ops.md +++ b/docs/docs/icicle/golang-bindings/vec-ops.md @@ -4,8 +4,8 @@ Icicle exposes a number of vector operations which a user can use: -* The VecOps API provides efficient vector operations such as addition, subtraction, and multiplication. 
-* MatrixTranspose API allows a user to perform a transpose on a vector representation of a matrix +* The VecOps API provides efficient vector operations such as addition, subtraction, and multiplication, supporting both single and batched operations. +* MatrixTranspose API allows a user to perform a transpose on a vector representation of a matrix, with support for batched transpositions. ## VecOps API Documentation @@ -121,6 +121,8 @@ type VecOpsConfig struct { isBOnDevice bool isResultOnDevice bool IsAsync bool + batch_size int + columns_batch bool Ext config_extension.ConfigExtensionHandler } ``` @@ -132,6 +134,8 @@ type VecOpsConfig struct { - **`isBOnDevice`**: Indicates if vector `b` is located on the device. - **`isResultOnDevice`**: Specifies where the result vector should be stored (device or host memory). - **`IsAsync`**: Controls whether the vector operation runs asynchronously. +- **`batch_size`**: Number of vectors (or operations) to process in a batch. Each vector operation will be performed independently on each batch element. +- **`columns_batch`**: true if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). If false, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array). - **`Ext`**: Extended configuration for backend. #### Default Configuration @@ -148,6 +152,8 @@ This section describes the functionality of the `TransposeMatrix` function used The function takes a matrix represented as a 1D slice and transposes it, storing the result in another 1D slice. +If VecOpsConfig specifies a batch_size greater than one, the transposition is performed on multiple matrices simultaneously, producing corresponding transposed matrices. The storage arrangement of batched matrices is determined by the columns_batch field in the VecOpsConfig. 
+ ### Function ```go diff --git a/docs/docs/icicle/primitives/vec_ops.md b/docs/docs/icicle/primitives/vec_ops.md index e9e10c1a9..7f546dc16 100644 --- a/docs/docs/icicle/primitives/vec_ops.md +++ b/docs/docs/icicle/primitives/vec_ops.md @@ -16,6 +16,8 @@ The `VecOpsConfig` struct is a configuration object used to specify parameters f - **`is_b_on_device: bool`**: Indicates whether the second input vector (`b`) is already on the device. If `false`, the vector will be copied from the host to the device. This field is optional. - **`is_result_on_device: bool`**: Indicates whether the result should be stored on the device. If `false`, the result will be transferred back to the host. - **`is_async: bool`**: Specifies whether the vector operation should be performed asynchronously. When `true`, the operation will not block the CPU, allowing other operations to proceed concurrently. Asynchronous execution requires careful synchronization to ensure data integrity. +- **`batch_size: int`**: Number of vectors (or operations) to process in a batch. Each vector operation will be performed independently on each batch element. +- **`columns_batch: bool`**: True if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). If false, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array). - **`ext: ConfigExtension*`**: Backend-specific extensions. #### Default Configuration @@ -28,6 +30,9 @@ static VecOpsConfig default_vec_ops_config() { false, // is_b_on_device false, // is_result_on_device false, // is_async + 1, // batch_size + false, // columns_batch + nullptr // ext }; return config; } @@ -35,7 +40,7 @@ static VecOpsConfig default_vec_ops_config() { ### Element-wise Operations -These functions perform element-wise operations on two input vectors `a` and `b`, producing an output vector. +These functions perform element-wise operations on two input vectors a and b. 
If VecOpsConfig specifies a batch_size greater than one, the operations are performed on multiple pairs of vectors simultaneously, producing corresponding output vectors. #### `vector_add` @@ -90,9 +95,31 @@ template eIcicleError convert_montgomery(const T* input, uint64_t size, bool is_into, const VecOpsConfig& config, T* output); ``` +### Reduction operations + +These functions perform reduction operations on vectors. If VecOpsConfig specifies a batch_size greater than one, the operations are performed on multiple vectors simultaneously, producing corresponding output values. The storage arrangement of batched vectors is determined by the columns_batch field in the VecOpsConfig. + +#### `vector_sum` + +Computes the sum of all elements in each vector in a batch. + +```cpp +template +eIcicleError vector_sum(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); +``` + +#### `vector_product` + +Computes the product of all elements in each vector in a batch. + +```cpp +template +eIcicleError vector_product(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); +``` + ### Scalar-Vector Operations -These functions apply a scalar operation to each element of a vector. +These functions apply a scalar operation to each element of a vector. If VecOpsConfig specifies a batch_size greater than one, the operations are performed on multiple vector-scalar pairs simultaneously, producing corresponding output vectors. #### `scalar_add_vec / scalar_sub_vec` @@ -123,7 +150,7 @@ eIcicleError scalar_mul_vec(const T* scalar_a, const T* vec_b, uint64_t size, co ### Matrix Operations -These functions perform operations on matrices. +These functions perform operations on matrices. If VecOpsConfig specifies a batch_size greater than one, the operations are performed on multiple matrices simultaneously, producing corresponding output matrices. 
#### `matrix_transpose` @@ -138,7 +165,7 @@ eIcicleError matrix_transpose(const T* mat_in, uint32_t nof_rows, uint32_t nof_c #### `bit_reverse` -Reorders the vector elements based on a bit-reversal pattern. +Reorders the vector elements based on a bit-reversal pattern. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously. ```cpp template @@ -147,16 +174,16 @@ eIcicleError bit_reverse(const T* vec_in, uint64_t size, const VecOpsConfig& con #### `slice` -Extracts a slice from a vector. +Extracts a slice from a vector. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously, producing corresponding output vectors. ```cpp template -eIcicleError slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size, const VecOpsConfig& config, T* vec_out); +eIcicleError slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size_in, uint64_t size_out, const VecOpsConfig& config, T* vec_out); ``` #### `highest_non_zero_idx` -Finds the highest non-zero index in a vector. +Finds the highest non-zero index in a vector. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously. ```cpp template @@ -165,7 +192,7 @@ eIcicleError highest_non_zero_idx(const T* vec_in, uint64_t size, const VecOpsCo #### `polynomial_eval` -Evaluates a polynomial at given domain points. +Evaluates a polynomial at given domain points. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously. ```cpp template @@ -174,7 +201,7 @@ eIcicleError polynomial_eval(const T* coeffs, uint64_t coeffs_size, const T* dom #### `polynomial_division` -Divides two polynomials. +Divides two polynomials. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously. 
```cpp template diff --git a/docs/docs/icicle/programmers_guide/general.md b/docs/docs/icicle/programmers_guide/general.md index b02cd2f9c..0bef2b850 100644 --- a/docs/docs/icicle/programmers_guide/general.md +++ b/docs/docs/icicle/programmers_guide/general.md @@ -21,6 +21,7 @@ The configuration struct allows users to modify settings such as: - Specifying whether inputs and outputs are on the host or device. - Adjusting the data layout for specific optimizations. +- Setting batching parameters (batch_size and columns_batch) to perform operations on multiple data sets simultaneously. - Passing custom options to the backend implementation through an extension mechanism, such as setting the number of CPU cores to use. ### Example (C++) @@ -31,6 +32,8 @@ The configuration struct allows users to modify settings such as: // Create config struct for vector add VecOpsConfig config = default_vec_ops_config(); // optionally modify the config struct here +config.batch_size = 4; // Process 4 vector operations in a batch +config.columns_batch = true; // Batched vectors are stored as columns // Call the API eIcicleError err = vector_add(vec_a, vec_b, size, config, vec_res); @@ -45,6 +48,8 @@ struct VecOpsConfig { bool is_b_on_device; /**< True if `b` is on the device, false if it is not. Default value: false. OPTIONAL. */ bool is_result_on_device; /**< If true, the output is preserved on the device, otherwise on the host. Default value: false. */ bool is_async; /**< Whether to run the vector operations asynchronously. */ + int batch_size; /**< Number of vector operations to process in a batch. Default value: 1. */ + bool columns_batch; /**< True if batched vectors are stored as columns; false if stored contiguously. Default value: false. */ ConfigExtension* ext = nullptr; /**< Backend-specific extension. 
*/ }; ``` diff --git a/docs/docs/icicle/rust-bindings/vec-ops.md b/docs/docs/icicle/rust-bindings/vec-ops.md index 61aa71570..c42caafb5 100644 --- a/docs/docs/icicle/rust-bindings/vec-ops.md +++ b/docs/docs/icicle/rust-bindings/vec-ops.md @@ -1,10 +1,10 @@ # Vector Operations API -Our vector operations API includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory. +Our vector operations API includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory, as well as batched operations. ## Vector Operations Configuration -The `VecOpsConfig` struct encapsulates the settings for vector operations, including device context and operation modes. +The `VecOpsConfig` struct encapsulates the settings for vector operations, including device context, operation modes, and batching parameters. ### `VecOpsConfig` @@ -17,6 +17,8 @@ pub struct VecOpsConfig { pub is_b_on_device: bool, pub is_result_on_device: bool, pub is_async: bool, + pub batch_size: usize, + pub columns_batch: bool, pub ext: ConfigExtension, } ``` @@ -28,6 +30,9 @@ pub struct VecOpsConfig { - **`is_b_on_device: bool`**: Indicates whether the input b data has been preloaded on the device memory. If `false` inputs will be copied from host to device. - **`is_result_on_device: bool`**: Indicates whether the output data is preloaded in device memory. If `false` outputs will be copied from host to device. - **`is_async: bool`**: Specifies whether the NTT operation should be performed asynchronously. +- **`batch_size: usize`**: Number of vector operations to process in a single batch. Each operation will be performed independently on each batch element. +- **`columns_batch: bool`**: true if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). 
If false, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array). + - **`ext: ConfigExtension`**: extended configuration for backend. ### Default Configuration @@ -40,11 +45,11 @@ let cfg = VecOpsConfig::default(); ## Vector Operations -Vector operations are implemented through the `VecOps` trait, providing methods for addition, subtraction, and multiplication of vectors. +Vector operations are implemented through the `VecOps` trait, providing methods for addition, subtraction, and multiplication of vectors. These methods support both single and batched operations based on the batch_size and columns_batch configurations. ### Methods -All operations are element-wise operations, and the results placed into the `result` param. These operations are not in place. +All operations are element-wise operations, and the results placed into the `result` param. These operations are not in place, except for accumulate. - **`add`**: Computes the element-wise sum of two vectors. - **`accumulate`**: Sum input b to a inplace. 
From 3a943a59fe86504b8558ed36ce5ef1c556360970 Mon Sep 17 00:00:00 2001 From: Shanie Winitz Date: Sun, 13 Oct 2024 18:04:33 +0300 Subject: [PATCH 10/43] formating --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 368 ++++++++++-------- icicle/include/icicle/api/babybear.h | 63 ++- icicle/include/icicle/api/bls12_377.h | 83 ++-- icicle/include/icicle/api/bls12_381.h | 83 ++-- icicle/include/icicle/api/bn254.h | 70 ++-- icicle/include/icicle/api/bw6_761.h | 74 ++-- icicle/include/icicle/api/grumpkin.h | 37 +- icicle/include/icicle/api/stark252.h | 29 +- .../include/icicle/backend/vec_ops_backend.h | 33 +- .../include/icicle/fields/complex_extension.h | 26 +- .../include/icicle/fields/quartic_extension.h | 8 +- .../default_backend/default_poly_backend.h | 14 +- icicle/include/icicle/utils/modifiers.h | 2 +- icicle/include/icicle/vec_ops.h | 149 +++---- icicle/src/vec_ops.cpp | 64 ++- icicle/tests/test_field_api.cpp | 360 +++++++++-------- 16 files changed, 878 insertions(+), 585 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index a56cdc73c..74678fc83 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -51,7 +51,13 @@ class VectorOpTask : public TaskBase VectorOpTask() : TaskBase() {} // Set the operands to execute a task of 2 operands and 1 output and dispatch the task - void send_2ops_task(VecOperation operation, const uint32_t nof_operations, const T* op_a, const T* op_b, const uint32_t stride , T* output) + void send_2ops_task( + VecOperation operation, + const uint32_t nof_operations, + const T* op_a, + const T* op_b, + const uint32_t stride, + T* output) { m_operation = operation; m_nof_operations = nof_operations; @@ -72,7 +78,8 @@ class VectorOpTask : public TaskBase dispatch(); } // Set the operands to execute a task of 1 operand and dispatch the task - void send_intermidiate_res_task(VecOperation operation, const uint64_t 
stop_index, const T* op_a, const uint64_t stride) + void + send_intermidiate_res_task(VecOperation operation, const uint64_t stop_index, const T* op_a, const uint64_t stride) { m_operation = operation; m_stop_index = stop_index; @@ -83,7 +90,13 @@ class VectorOpTask : public TaskBase // Set the operands for bit_reverse operation and dispatch the task void send_bit_reverse_task( - VecOperation operation, uint32_t bit_size, uint64_t start_index, const uint32_t nof_operations, const T* op_a, const uint64_t stride, T* output) + VecOperation operation, + uint32_t bit_size, + uint64_t start_index, + const uint32_t nof_operations, + const T* op_a, + const uint64_t stride, + T* output) { m_operation = operation; m_bit_size = bit_size; @@ -96,7 +109,13 @@ class VectorOpTask : public TaskBase } // Set the operands for slice operation and dispatch the task - void send_slice_task(VecOperation operation, uint64_t stride, uint64_t stride_out, const uint32_t nof_operations, const T* op_a, T* output) + void send_slice_task( + VecOperation operation, + uint64_t stride, + uint64_t stride_out, + const uint32_t nof_operations, + const T* op_a, + T* output) { m_operation = operation; m_nof_operations = nof_operations; @@ -108,13 +127,22 @@ class VectorOpTask : public TaskBase } // Set the operands for replace_elements operation and dispatch the task - void send_replace_elements_task(VecOperation operation, const T* mat_in, const uint32_t nof_operations, std::vector& start_indices_in_mat, uint64_t start_index, uint32_t log_nof_rows, uint32_t log_nof_cols, const uint32_t stride, T* mat_out) + void send_replace_elements_task( + VecOperation operation, + const T* mat_in, + const uint32_t nof_operations, + std::vector& start_indices_in_mat, + uint64_t start_index, + uint32_t log_nof_rows, + uint32_t log_nof_cols, + const uint32_t stride, + T* mat_out) { m_operation = operation; m_op_a = mat_in; m_nof_operations = nof_operations; m_start_indices_in_mat = &start_indices_in_mat; - 
m_start_index = start_index; //start index in start_indices vector + m_start_index = start_index; // start index in start_indices vector m_log_nof_rows = log_nof_rows; m_log_nof_cols = log_nof_cols; m_stride = stride; @@ -122,22 +150,27 @@ class VectorOpTask : public TaskBase dispatch(); } - void send_out_of_place_matrix_transpose_task(VecOperation operation, const T* mat_in, const uint32_t nof_operations, const uint32_t nof_rows, const uint32_t nof_cols, const uint32_t stride, T* mat_out) - { - m_operation = operation; - m_op_a = mat_in; - m_nof_operations = nof_operations; - m_nof_rows = nof_rows; - m_nof_cols = nof_cols; - m_stride = stride; - m_output = mat_out; - dispatch(); - } + void send_out_of_place_matrix_transpose_task( + VecOperation operation, + const T* mat_in, + const uint32_t nof_operations, + const uint32_t nof_rows, + const uint32_t nof_cols, + const uint32_t stride, + T* mat_out) + { + m_operation = operation; + m_op_a = mat_in; + m_nof_operations = nof_operations; + m_nof_rows = nof_rows; + m_nof_cols = nof_cols; + m_stride = stride; + m_output = mat_out; + dispatch(); + } // Execute the selected function based on m_operation - virtual void execute() { - (this->*functionPtrs[static_cast(m_operation)])(); - } + virtual void execute() { (this->*functionPtrs[static_cast(m_operation)])(); } private: // Single worker functionality to execute vector add (+) @@ -238,10 +271,10 @@ class VectorOpTask : public TaskBase if (m_output == m_op_a) { // inplace calculation if (rev_idx < idx) { // only on of the threads need to work - std::swap(m_output[m_stride*idx], m_output[m_stride*rev_idx]); + std::swap(m_output[m_stride * idx], m_output[m_stride * rev_idx]); } - } else { // out of place calculation - m_output[m_stride*idx] = m_op_a[m_stride*rev_idx]; // set index value + } else { // out of place calculation + m_output[m_stride * idx] = m_op_a[m_stride * rev_idx]; // set index value } } } @@ -255,7 +288,8 @@ class VectorOpTask : public TaskBase } // 
Function to perform modulus with Mersenne number - uint64_t mersenne_mod(uint64_t shifted_idx, uint32_t total_bits) { + uint64_t mersenne_mod(uint64_t shifted_idx, uint32_t total_bits) + { uint64_t mod = (1ULL << total_bits) - 1; shifted_idx = (shifted_idx & mod) + (shifted_idx >> total_bits); while (shifted_idx >= mod) { @@ -264,7 +298,6 @@ class VectorOpTask : public TaskBase return shifted_idx; } - // Single worker functionality to execute replace elements void replace_elements() { @@ -272,7 +305,7 @@ class VectorOpTask : public TaskBase for (uint32_t i = 0; i < m_nof_operations; ++i) { uint64_t start_idx = (*m_start_indices_in_mat)[m_start_index + i]; uint64_t idx = start_idx; - T prev = m_op_a[m_stride * idx]; + T prev = m_op_a[m_stride * idx]; do { uint64_t shifted_idx = idx << m_log_nof_rows; uint64_t new_idx = mersenne_mod(shifted_idx, total_bits); @@ -294,8 +327,6 @@ class VectorOpTask : public TaskBase } } - - // An array of available function pointers arranged according to the VecOperation enum using FunctionPtr = void (VectorOpTask::*)(); static constexpr std::array(NOF_OPERATIONS)> functionPtrs = { @@ -315,28 +346,28 @@ class VectorOpTask : public TaskBase &VectorOpTask::replace_elements, // REPLACE_ELEMENTS &VectorOpTask::out_of_place_transpose // OUT_OF_PLACE_MATRIX_TRANSPOSE - }; - VecOperation m_operation; // the operation to execute - uint32_t m_nof_operations; // number of operations to execute for this task - const T* m_op_a; // pointer to operand A. Operand A is a vector, or metrix in case of replace_elements - const T* m_op_b; // pointer to operand B. 
Operand B is a vector or scalar - uint64_t m_start_index; // index used in bitreverse operation and out of place matrix transpose - uint64_t m_stop_index; // index used in reduce operations and out of place matrix transpose - uint32_t m_bit_size; // use in bitrev operation - uint64_t m_stride; // used to support column batch operations - uint64_t m_stride_out; // used in slice operation - T* m_output; // pointer to the output. Can be a vector, scalar pointer, or a matrix pointer in case of replace_elements - uint32_t m_log_nof_rows; // log of the number of rows in the matrix, used in replace_elements - uint32_t m_log_nof_cols; // log of the number of columns in the matrix, used in replace_elements - uint32_t m_nof_rows; // the number of rows in the matrix, used in out of place matrix transpose - uint32_t m_nof_cols; // the number of columns in the matrix, used in out of place matrix transpose + VecOperation m_operation; // the operation to execute + uint32_t m_nof_operations; // number of operations to execute for this task + const T* m_op_a; // pointer to operand A. Operand A is a vector, or metrix in case of replace_elements + const T* m_op_b; // pointer to operand B. Operand B is a vector or scalar + uint64_t m_start_index; // index used in bitreverse operation and out of place matrix transpose + uint64_t m_stop_index; // index used in reduce operations and out of place matrix transpose + uint32_t m_bit_size; // use in bitrev operation + uint64_t m_stride; // used to support column batch operations + uint64_t m_stride_out; // used in slice operation + T* + m_output; // pointer to the output. 
Can be a vector, scalar pointer, or a matrix pointer in case of replace_elements + uint32_t m_log_nof_rows; // log of the number of rows in the matrix, used in replace_elements + uint32_t m_log_nof_cols; // log of the number of columns in the matrix, used in replace_elements + uint32_t m_nof_rows; // the number of rows in the matrix, used in out of place matrix transpose + uint32_t m_nof_cols; // the number of columns in the matrix, used in out of place matrix transpose const std::vector* m_start_indices_in_mat; // Indices used in replace_elements operations -public: - T m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer - uint64_t m_idx_in_batch; // index in the batch. Used in intermidiate res tasks +public: + T m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer + uint64_t m_idx_in_batch; // index in the batch. Used in intermidiate res tasks }; // class VectorOpTask #define NOF_OPERATIONS_PER_TASK 512 @@ -357,10 +388,11 @@ eIcicleError cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { TasksManager> task_manager(get_nof_workers(config) - 1); - const uint64_t total_nof_operations = size*config.batch_size; + const uint64_t total_nof_operations = size * config.batch_size; for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_2ops_task(op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), vec_a + i, vec_b + i, 1, output + i); + task_p->send_2ops_task( + op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), vec_a + i, vec_b + i, 1, output + i); } task_manager.wait_done(); return eIcicleError::SUCCESS; @@ -369,21 +401,27 @@ cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t size, // Execute a full task from the type vector = scalar (op) vector template eIcicleError 
cpu_scalar_vector_op( - VecOperation op, const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output) + VecOperation op, + const T* scalar_a, + const T* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig& config, + T* output) { TasksManager> task_manager(get_nof_workers(config) - 1); - const uint64_t total_nof_operations = use_single_scalar? size*config.batch_size : size; - const uint32_t stride = (!use_single_scalar && config.columns_batch)? config.batch_size : 1; - for (uint32_t idx_in_batch = 0; idx_in_batch < (use_single_scalar? 1 : config.batch_size); idx_in_batch++) { + const uint64_t total_nof_operations = use_single_scalar ? size * config.batch_size : size; + const uint32_t stride = (!use_single_scalar && config.columns_batch) ? config.batch_size : 1; + for (uint32_t idx_in_batch = 0; idx_in_batch < (use_single_scalar ? 1 : config.batch_size); idx_in_batch++) { for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_2ops_task( - op, - std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), - scalar_a + idx_in_batch, - (!use_single_scalar && config.columns_batch)? vec_b + idx_in_batch + i*config.batch_size : vec_b + idx_in_batch*size + i, - stride, - (!use_single_scalar && config.columns_batch)? output + idx_in_batch + i*config.batch_size : output + idx_in_batch*size + i); + op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), scalar_a + idx_in_batch, + (!use_single_scalar && config.columns_batch) ? vec_b + idx_in_batch + i * config.batch_size + : vec_b + idx_in_batch * size + i, + stride, + (!use_single_scalar && config.columns_batch) ? 
output + idx_in_batch + i * config.batch_size + : output + idx_in_batch * size + i); } } task_manager.wait_done(); @@ -394,8 +432,8 @@ eIcicleError cpu_scalar_vector_op( // Functions to register at the CPU backend /*********************************** ADD ***********************************/ template -eIcicleError -cpu_vector_add(const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_add( + const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { return cpu_2vectors_op(VecOperation::VECTOR_ADD, vec_a, vec_b, size, config, output); } @@ -414,8 +452,8 @@ REGISTER_VECTOR_ACCUMULATE_BACKEND("CPU", cpu_vector_accumulate); /*********************************** SUB ***********************************/ template -eIcicleError -cpu_vector_sub(const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_sub( + const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { return cpu_2vectors_op(VecOperation::VECTOR_SUB, vec_a, vec_b, size, config, output); } @@ -424,8 +462,8 @@ REGISTER_VECTOR_SUB_BACKEND("CPU", cpu_vector_sub); /*********************************** MUL ***********************************/ template -eIcicleError -cpu_vector_mul(const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_mul( + const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { return cpu_2vectors_op(VecOperation::VECTOR_MUL, vec_a, vec_b, size, config, output); } @@ -434,8 +472,8 @@ REGISTER_VECTOR_MUL_BACKEND("CPU", cpu_vector_mul); /*********************************** DIV ***********************************/ template -eIcicleError -cpu_vector_div(const Device& device, const T* vec_a, const T* vec_b, uint64_t size, 
const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_div( + const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { return cpu_2vectors_op(VecOperation::VECTOR_DIV, vec_a, vec_b, size, config, output); } @@ -448,16 +486,15 @@ eIcicleError cpu_convert_montgomery( const Device& device, const T* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, T* output) { TasksManager> task_manager(get_nof_workers(config) - 1); - const uint64_t total_nof_operations = size*config.batch_size; + const uint64_t total_nof_operations = size * config.batch_size; for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_1op_task( - (is_to_montgomery ? CONVERT_TO_MONTGOMERY : CONVERT_FROM_MONTGOMERY), std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), - input + i, output + i); + (is_to_montgomery ? CONVERT_TO_MONTGOMERY : CONVERT_FROM_MONTGOMERY), + std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), input + i, output + i); } task_manager.wait_done(); - for (uint64_t i = 0; i < size*config.batch_size; i++) { - } + for (uint64_t i = 0; i < size * config.batch_size; i++) {} return eIcicleError::SUCCESS; } @@ -482,28 +519,28 @@ eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t size, uint64_t idx_in_batch = 0; // run until all vector deployed and all tasks completed while (true) { - VectorOpTask* task_p = vec_a_offset < size ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); - if (task_p == nullptr) { - return eIcicleError::SUCCESS; - } + VectorOpTask* task_p = + vec_a_offset < size ? 
task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + if (task_p == nullptr) { return eIcicleError::SUCCESS; } if (task_p->is_completed()) { - output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] ? output[task_p->m_idx_in_batch] + task_p->m_intermidiate_res : task_p->m_intermidiate_res; + output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] + ? output[task_p->m_idx_in_batch] + task_p->m_intermidiate_res + : task_p->m_intermidiate_res; output_initialized[task_p->m_idx_in_batch] = true; } if (vec_a_offset < size) { task_p->m_idx_in_batch = idx_in_batch; task_p->send_intermidiate_res_task( - VecOperation::VECTOR_SUM, - std::min((uint64_t)NOF_OPERATIONS_PER_TASK , size - vec_a_offset), - config.columns_batch? vec_a + idx_in_batch + vec_a_offset*config.batch_size : vec_a + idx_in_batch*size + vec_a_offset, - config.columns_batch? config.batch_size : 1); + VecOperation::VECTOR_SUM, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - vec_a_offset), + config.columns_batch ? vec_a + idx_in_batch + vec_a_offset * config.batch_size + : vec_a + idx_in_batch * size + vec_a_offset, + config.columns_batch ? 
config.batch_size : 1); idx_in_batch++; if (idx_in_batch == config.batch_size) { vec_a_offset += NOF_OPERATIONS_PER_TASK; idx_in_batch = 0; } - } - else { + } else { task_p->set_idle(); } } @@ -513,7 +550,8 @@ REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); /*********************************** PRODUCT ***********************************/ template -eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) +eIcicleError +cpu_vector_product(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) { TasksManager> task_manager(get_nof_workers(config) - 1); std::vector output_initialized = std::vector(config.batch_size, false); @@ -521,28 +559,28 @@ eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t s uint64_t idx_in_batch = 0; // run until all vector deployed and all tasks completed while (true) { - VectorOpTask* task_p = vec_a_offset < size ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); - if (task_p == nullptr) { - return eIcicleError::SUCCESS; - } + VectorOpTask* task_p = + vec_a_offset < size ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + if (task_p == nullptr) { return eIcicleError::SUCCESS; } if (task_p->is_completed()) { - output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] ? output[task_p->m_idx_in_batch] * task_p->m_intermidiate_res : task_p->m_intermidiate_res; + output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] + ? output[task_p->m_idx_in_batch] * task_p->m_intermidiate_res + : task_p->m_intermidiate_res; output_initialized[task_p->m_idx_in_batch] = true; } if (vec_a_offset < size) { task_p->m_idx_in_batch = idx_in_batch; task_p->send_intermidiate_res_task( - VecOperation::VECTOR_PRODUCT, - std::min((uint64_t)NOF_OPERATIONS_PER_TASK , size - vec_a_offset), - config.columns_batch? 
vec_a + idx_in_batch + vec_a_offset*config.batch_size : vec_a + idx_in_batch*size + vec_a_offset, - config.columns_batch? config.batch_size : 1); + VecOperation::VECTOR_PRODUCT, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - vec_a_offset), + config.columns_batch ? vec_a + idx_in_batch + vec_a_offset * config.batch_size + : vec_a + idx_in_batch * size + vec_a_offset, + config.columns_batch ? config.batch_size : 1); idx_in_batch++; if (idx_in_batch == config.batch_size) { vec_a_offset += NOF_OPERATIONS_PER_TASK; idx_in_batch = 0; } - } - else { + } else { task_p->set_idle(); } } @@ -553,7 +591,13 @@ REGISTER_VECTOR_PRODUCT_BACKEND("CPU", cpu_vector_product); /*********************************** Scalar + Vector***********************************/ template eIcicleError cpu_scalar_add( - const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output) + const Device& device, + const T* scalar_a, + const T* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig& config, + T* output) { return cpu_scalar_vector_op(VecOperation::SCALAR_ADD_VEC, scalar_a, vec_b, size, use_single_scalar, config, output); } @@ -563,7 +607,13 @@ REGISTER_SCALAR_ADD_VEC_BACKEND("CPU", cpu_scalar_add); /*********************************** Scalar - Vector***********************************/ template eIcicleError cpu_scalar_sub( - const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output) + const Device& device, + const T* scalar_a, + const T* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig& config, + T* output) { return cpu_scalar_vector_op(VecOperation::SCALAR_SUB_VEC, scalar_a, vec_b, size, use_single_scalar, config, output); } @@ -573,7 +623,13 @@ REGISTER_SCALAR_SUB_VEC_BACKEND("CPU", cpu_scalar_sub); /*********************************** MUL BY SCALAR***********************************/ template eIcicleError 
cpu_scalar_mul( - const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output) + const Device& device, + const T* scalar_a, + const T* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig& config, + T* output) { return cpu_scalar_vector_op(VecOperation::SCALAR_MUL_VEC, scalar_a, vec_b, size, use_single_scalar, config, output); } @@ -587,22 +643,19 @@ eIcicleError out_of_place_matrix_transpose( const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) { TasksManager> task_manager(get_nof_workers(config) - 1); - uint32_t stride = config.columns_batch? config.batch_size : 1; + uint32_t stride = config.columns_batch ? config.batch_size : 1; const uint64_t total_elements_one_mat = static_cast(nof_rows) * nof_cols; - const uint32_t NOF_ROWS_PER_TASK = std::min((uint64_t)nof_rows, std::max((uint64_t)(NOF_OPERATIONS_PER_TASK / nof_cols) , (uint64_t)1)); + const uint32_t NOF_ROWS_PER_TASK = + std::min((uint64_t)nof_rows, std::max((uint64_t)(NOF_OPERATIONS_PER_TASK / nof_cols), (uint64_t)1)); for (uint32_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { - const T* cur_mat_in = config.columns_batch? mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements_one_mat; - T* cur_mat_out = config.columns_batch? mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements_one_mat; + const T* cur_mat_in = config.columns_batch ? mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements_one_mat; + T* cur_mat_out = config.columns_batch ? 
mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements_one_mat; // Perform the matrix transpose for (uint32_t i = 0; i < nof_rows; i += NOF_ROWS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_out_of_place_matrix_transpose_task( - OUT_OF_PLACE_MATRIX_TRANSPOSE, - cur_mat_in + stride*i*nof_cols, - std::min((uint64_t)NOF_ROWS_PER_TASK, (uint64_t)nof_rows - i), - nof_rows, - nof_cols, - stride, + OUT_OF_PLACE_MATRIX_TRANSPOSE, cur_mat_in + stride * i * nof_cols, + std::min((uint64_t)NOF_ROWS_PER_TASK, (uint64_t)nof_rows - i), nof_rows, nof_cols, stride, cur_mat_out + (stride * i)); } } @@ -610,7 +663,8 @@ eIcicleError out_of_place_matrix_transpose( return eIcicleError::SUCCESS; } -uint32_t gcd(uint32_t a, uint32_t b) { +uint32_t gcd(uint32_t a, uint32_t b) +{ while (b != 0) { uint32_t temp = b; b = a % b; @@ -621,9 +675,20 @@ uint32_t gcd(uint32_t a, uint32_t b) { // Recursive function to generate all k-ary necklaces and to replace the elements withing the necklaces template -void gen_necklace(uint32_t t, uint32_t p, uint32_t k, uint32_t length, std::vector& necklace, std::vector& task_indices) { +void gen_necklace( + uint32_t t, + uint32_t p, + uint32_t k, + uint32_t length, + std::vector& necklace, + std::vector& task_indices) +{ if (t > length) { - if (length % p == 0 && !std::all_of(necklace.begin() + 1, necklace.begin() + length + 1,[first_element = necklace[1]](uint32_t x) { return x == first_element; })) { + if ( + length % p == 0 && + !std::all_of(necklace.begin() + 1, necklace.begin() + length + 1, [first_element = necklace[1]](uint32_t x) { + return x == first_element; + })) { uint32_t start_idx = 0; uint64_t multiplier = 1; for (int i = length; i >= 1; --i) { // Compute start_idx as the decimal representation of the necklace @@ -645,17 +710,21 @@ void gen_necklace(uint32_t t, uint32_t p, uint32_t k, uint32_t length, std::vect } template -eIcicleError matrix_transpose_necklaces(const T* mat_in, uint32_t 
nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out){ +eIcicleError matrix_transpose_necklaces( + const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) +{ uint32_t log_nof_rows = static_cast(std::floor(std::log2(nof_rows))); uint32_t log_nof_cols = static_cast(std::floor(std::log2(nof_cols))); uint32_t gcd_value = gcd(log_nof_rows, log_nof_cols); uint32_t k = 1 << gcd_value; // Base of necklaces - uint32_t length = (log_nof_cols + log_nof_rows) / gcd_value; // length of necklaces. Since all are powers of 2, equvalent to (log_nof_cols + log_nof_rows) / gcd_value; + uint32_t length = + (log_nof_cols + log_nof_rows) / gcd_value; // length of necklaces. Since all are powers of 2, equvalent to + // (log_nof_cols + log_nof_rows) / gcd_value; const uint64_t max_nof_operations = NOF_OPERATIONS_PER_TASK / length; const uint64_t total_elements_one_mat = static_cast(nof_rows) * nof_cols; std::vector necklace(length + 1, 0); - std::vector start_indices_in_mat; // Collect start indices + std::vector start_indices_in_mat; // Collect start indices gen_necklace(1, 1, k, length, necklace, start_indices_in_mat); TasksManager> task_manager(get_nof_workers(config) - 1); @@ -664,22 +733,16 @@ eIcicleError matrix_transpose_necklaces(const T* mat_in, uint32_t nof_rows, uint for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_replace_elements_task( - REPLACE_ELEMENTS, - config.columns_batch? mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements_one_mat, - nof_operations, - start_indices_in_mat, - i, - log_nof_rows, - log_nof_cols, - config.columns_batch? config.batch_size : 1, - config.columns_batch? mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements_one_mat); + REPLACE_ELEMENTS, config.columns_batch ? 
mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements_one_mat, + nof_operations, start_indices_in_mat, i, log_nof_rows, log_nof_cols, + config.columns_batch ? config.batch_size : 1, + config.columns_batch ? mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements_one_mat); } } task_manager.wait_done(); return eIcicleError::SUCCESS; } - template eIcicleError cpu_matrix_transpose( const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) @@ -690,7 +753,7 @@ eIcicleError cpu_matrix_transpose( bool is_power_of_2 = (nof_rows & (nof_rows - 1)) == 0 && (nof_cols & (nof_cols - 1)) == 0; bool is_inplace = mat_in == mat_out; if (!is_inplace) { - return(out_of_place_matrix_transpose(device, mat_in, nof_rows, nof_cols, config, mat_out)); + return (out_of_place_matrix_transpose(device, mat_in, nof_rows, nof_cols, config, mat_out)); } else if (is_power_of_2) { return (matrix_transpose_necklaces(mat_in, nof_rows, nof_cols, config, mat_out)); } else { @@ -721,13 +784,10 @@ cpu_bit_reverse(const Device& device, const T* vec_in, uint64_t size, const VecO VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_bit_reverse_task( - BIT_REVERSE, - logn, - i, - std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - i), - config.columns_batch? vec_in + idx_in_batch : vec_in + idx_in_batch*size, - config.columns_batch? config.batch_size : 1, - config.columns_batch? vec_out + idx_in_batch: vec_out + idx_in_batch*size); + BIT_REVERSE, logn, i, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - i), + config.columns_batch ? vec_in + idx_in_batch : vec_in + idx_in_batch * size, + config.columns_batch ? config.batch_size : 1, + config.columns_batch ? 
vec_out + idx_in_batch : vec_out + idx_in_batch * size); } } task_manager.wait_done(); @@ -752,21 +812,19 @@ eIcicleError cpu_slice( const VecOpsConfig& config, T* vec_out) { - ICICLE_ASSERT(vec_in != nullptr && vec_out != nullptr) << "Error: Invalid argument - input or output vector is null"; - ICICLE_ASSERT(offset + (size_out-1) * stride < size_in) << "Error: Invalid argument - slice out of bound"; + ICICLE_ASSERT(offset + (size_out - 1) * stride < size_in) << "Error: Invalid argument - slice out of bound"; TasksManager> task_manager(get_nof_workers(config) - 1); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { for (uint64_t i = 0; i < size_out; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_slice_task( - SLICE, - config.columns_batch? stride*config.batch_size : stride, - config.columns_batch? config.batch_size : 1, + SLICE, config.columns_batch ? stride * config.batch_size : stride, config.columns_batch ? config.batch_size : 1, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size_out - i), - config.columns_batch? vec_in + idx_in_batch + (offset + i * stride)*config.batch_size : vec_in + idx_in_batch*size_in + offset + i * stride, - config.columns_batch? vec_out + idx_in_batch + i*config.batch_size : vec_out + idx_in_batch*size_out + i); + config.columns_batch ? vec_in + idx_in_batch + (offset + i * stride) * config.batch_size + : vec_in + idx_in_batch * size_in + offset + i * stride, + config.columns_batch ? vec_out + idx_in_batch + i * config.batch_size : vec_out + idx_in_batch * size_out + i); } } task_manager.wait_done(); @@ -783,11 +841,12 @@ template eIcicleError cpu_highest_non_zero_idx( const Device& device, const T* input, uint64_t size, const VecOpsConfig& config, int64_t* out_idx /*OUT*/) { - ICICLE_ASSERT(input && out_idx && size !=0) << "Error: Invalid argument"; - uint64_t stride = config.columns_batch? 
config.batch_size : 1; + ICICLE_ASSERT(input && out_idx && size != 0) << "Error: Invalid argument"; + uint64_t stride = config.columns_batch ? config.batch_size : 1; for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) { out_idx[idx_in_batch] = -1; // zero vector is considered '-1' since 0 would be zero in vec[0] - const T* curr_input = config.columns_batch? input + idx_in_batch : input + idx_in_batch * size; // Pointer to the current vector + const T* curr_input = + config.columns_batch ? input + idx_in_batch : input + idx_in_batch * size; // Pointer to the current vector for (int64_t i = size - 1; i >= 0; --i) { if (curr_input[i * stride] != T::zero()) { out_idx[idx_in_batch] = i; @@ -800,7 +859,6 @@ eIcicleError cpu_highest_non_zero_idx( REGISTER_HIGHEST_NON_ZERO_IDX_BACKEND("CPU", cpu_highest_non_zero_idx); - /*********************************** Polynomial evaluation ***********************************/ template @@ -818,12 +876,13 @@ eIcicleError cpu_poly_eval( // example: ax^2+bx+c is computed as (1) r=a, (2) r=r*x+b, (3) r=r*x+c uint64_t stride = config.columns_batch ? config.batch_size : 1; for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) { - const T* curr_coeffs = config.columns_batch? coeffs + idx_in_batch : coeffs + idx_in_batch * coeffs_size; - T* curr_evals = config.columns_batch? evals + idx_in_batch : evals + idx_in_batch * domain_size; + const T* curr_coeffs = config.columns_batch ? coeffs + idx_in_batch : coeffs + idx_in_batch * coeffs_size; + T* curr_evals = config.columns_batch ? 
evals + idx_in_batch : evals + idx_in_batch * domain_size; for (uint64_t eval_idx = 0; eval_idx < domain_size; ++eval_idx) { curr_evals[eval_idx * stride] = curr_coeffs[(coeffs_size - 1) * stride]; for (int64_t coeff_idx = coeffs_size - 2; coeff_idx >= 0; --coeff_idx) { - curr_evals[eval_idx * stride] = curr_evals[eval_idx * stride] * domain[eval_idx] + curr_coeffs[coeff_idx * stride]; + curr_evals[eval_idx * stride] = + curr_evals[eval_idx * stride] * domain[eval_idx] + curr_coeffs[coeff_idx * stride]; } } } @@ -838,7 +897,7 @@ void school_book_division_step_cpu(T* r, T* q, const T* b, int deg_r, int deg_b, { int64_t monomial = deg_r - deg_b; // monomial=1 is 'x', monomial=2 is x^2 etc. - T lc_r = r[deg_r * stride]; // leading coefficient of r + T lc_r = r[deg_r * stride]; // leading coefficient of r T monomial_coeff = lc_r * lc_b_inv; // lc_r / lc_b // adding monomial s to q (q=q+s) @@ -870,22 +929,27 @@ eIcicleError cpu_poly_divide( // ICICLE_CHECK(icicle_copy_async(r_out, numerator, r_size * config.batch_size * sizeof(T), config.stream)); // copy numerator to r_out // FIXME should it be copied using icicle_copy_async? - for (uint64_t i = 0; i < (numerator_deg+1)*config.batch_size; ++i) { + for (uint64_t i = 0; i < (numerator_deg + 1) * config.batch_size; ++i) { r_out[i] = numerator[i]; } - uint32_t stride = config.columns_batch? config.batch_size : 1; + uint32_t stride = config.columns_batch ? config.batch_size : 1; auto deg_r = std::make_unique(config.batch_size); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) { - const T* curr_denumerator = config.columns_batch? denumerator + idx_in_batch : denumerator + idx_in_batch * (denumerator_deg+1); // Pointer to the current vector - T* curr_q_out = config.columns_batch? q_out + idx_in_batch : q_out + idx_in_batch * q_size; // Pointer to the current vector - T* curr_r_out = config.columns_batch? 
r_out + idx_in_batch : r_out + idx_in_batch * r_size; // Pointer to the current vector + const T* curr_denumerator = config.columns_batch + ? denumerator + idx_in_batch + : denumerator + idx_in_batch * (denumerator_deg + 1); // Pointer to the current vector + T* curr_q_out = + config.columns_batch ? q_out + idx_in_batch : q_out + idx_in_batch * q_size; // Pointer to the current vector + T* curr_r_out = + config.columns_batch ? r_out + idx_in_batch : r_out + idx_in_batch * r_size; // Pointer to the current vector // invert largest coeff of b const T& lc_b_inv = T::inverse(curr_denumerator[denumerator_deg * stride]); deg_r[idx_in_batch] = numerator_deg; while (deg_r[idx_in_batch] >= denumerator_deg) { // each iteration is removing the largest monomial in r until deg(r)* config, babybear::scalar_t* output); + const babybear::scalar_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + babybear::scalar_t* output); extern "C" eIcicleError babybear_ntt_release_domain(); extern "C" eIcicleError babybear_extension_ntt( - const babybear::extension_t* input, int size, NTTDir dir, const NTTConfig* config, babybear::extension_t* output); - + const babybear::extension_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + babybear::extension_t* output); extern "C" void babybear_extension_generate_scalars(babybear::extension_t* scalars, int size); -extern "C" eIcicleError babybear_extension_scalar_convert_montgomery( - const babybear::extension_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, babybear::extension_t* output); +extern "C" eIcicleError babybear_extension_scalar_convert_montgomery( + const babybear::extension_t* input, + uint64_t size, + bool is_into, + const VecOpsConfig* config, + babybear::extension_t* output); extern "C" eIcicleError babybear_extension_vector_mul( - const babybear::extension_t* vec_a, const babybear::extension_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::extension_t* result); + const 
babybear::extension_t* vec_a, + const babybear::extension_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + babybear::extension_t* result); extern "C" eIcicleError babybear_extension_vector_add( - const babybear::extension_t* vec_a, const babybear::extension_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::extension_t* result); + const babybear::extension_t* vec_a, + const babybear::extension_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + babybear::extension_t* result); extern "C" eIcicleError babybear_extension_vector_sub( - const babybear::extension_t* vec_a, const babybear::extension_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::extension_t* result); + const babybear::extension_t* vec_a, + const babybear::extension_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + babybear::extension_t* result); extern "C" eIcicleError babybear_extension_matrix_transpose( const babybear::extension_t* input, @@ -50,15 +72,26 @@ extern "C" eIcicleError babybear_extension_matrix_transpose( extern "C" eIcicleError babybear_extension_bit_reverse( const babybear::extension_t* input, uint64_t n, const VecOpsConfig* config, babybear::extension_t* output); - extern "C" eIcicleError babybear_vector_mul( - const babybear::scalar_t* vec_a, const babybear::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* result); + const babybear::scalar_t* vec_a, + const babybear::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + babybear::scalar_t* result); extern "C" eIcicleError babybear_vector_add( - const babybear::scalar_t* vec_a, const babybear::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* result); + const babybear::scalar_t* vec_a, + const babybear::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + babybear::scalar_t* result); extern "C" eIcicleError babybear_vector_sub( - const babybear::scalar_t* vec_a, const babybear::scalar_t* vec_b, uint64_t n, const VecOpsConfig* 
config, babybear::scalar_t* result); + const babybear::scalar_t* vec_a, + const babybear::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + babybear::scalar_t* result); extern "C" eIcicleError babybear_matrix_transpose( const babybear::scalar_t* input, @@ -69,5 +102,3 @@ extern "C" eIcicleError babybear_matrix_transpose( extern "C" eIcicleError babybear_bit_reverse( const babybear::scalar_t* input, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* output); - - diff --git a/icicle/include/icicle/api/bls12_377.h b/icicle/include/icicle/api/bls12_377.h index c617dcaf9..3bbb17ef5 100644 --- a/icicle/include/icicle/api/bls12_377.h +++ b/icicle/include/icicle/api/bls12_377.h @@ -19,23 +19,35 @@ extern "C" void bls12_377_g2_generate_projective_points(bls12_377::g2_projective extern "C" void bls12_377_g2_generate_affine_points(bls12_377::g2_affine_t* points, int size); extern "C" eIcicleError bls12_377_g2_affine_convert_montgomery( - const bls12_377::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::g2_affine_t* output); + const bls12_377::g2_affine_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + bls12_377::g2_affine_t* output); extern "C" eIcicleError bls12_377_g2_projective_convert_montgomery( - const bls12_377::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::g2_projective_t* output); + const bls12_377::g2_projective_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + bls12_377::g2_projective_t* output); extern "C" eIcicleError bls12_377_ecntt( - const bls12_377::projective_t* input, int size, NTTDir dir, const NTTConfig* config, bls12_377::projective_t* output); - + const bls12_377::projective_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + bls12_377::projective_t* output); extern "C" eIcicleError bls12_377_precompute_msm_bases( - const bls12_377::affine_t* bases, - int nof_bases, - const MSMConfig* config, - 
bls12_377::affine_t* output_bases); + const bls12_377::affine_t* bases, int nof_bases, const MSMConfig* config, bls12_377::affine_t* output_bases); extern "C" eIcicleError bls12_377_msm( - const bls12_377::scalar_t* scalars, const bls12_377::affine_t* points, int msm_size, const MSMConfig* config, bls12_377::projective_t* out); + const bls12_377::scalar_t* scalars, + const bls12_377::affine_t* points, + int msm_size, + const MSMConfig* config, + bls12_377::projective_t* out); extern "C" bool bls12_377_eq(bls12_377::projective_t* point1, bls12_377::projective_t* point2); @@ -49,38 +61,63 @@ extern "C" eIcicleError bls12_377_affine_convert_montgomery( const bls12_377::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::affine_t* output); extern "C" eIcicleError bls12_377_projective_convert_montgomery( - const bls12_377::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::projective_t* output); + const bls12_377::projective_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + bls12_377::projective_t* output); extern "C" eIcicleError bls12_377_g2_precompute_msm_bases( - const bls12_377::g2_affine_t* bases, - int nof_bases, - const MSMConfig* config, - bls12_377::g2_affine_t* output_bases); + const bls12_377::g2_affine_t* bases, int nof_bases, const MSMConfig* config, bls12_377::g2_affine_t* output_bases); extern "C" eIcicleError bls12_377_g2_msm( - const bls12_377::scalar_t* scalars, const bls12_377::g2_affine_t* points, int msm_size, const MSMConfig* config, bls12_377::g2_projective_t* out); + const bls12_377::scalar_t* scalars, + const bls12_377::g2_affine_t* points, + int msm_size, + const MSMConfig* config, + bls12_377::g2_projective_t* out); extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size); extern "C" void bls12_377_scalar_convert_montgomery( - const bls12_377::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bls12_377::scalar_t* 
output); + const bls12_377::scalar_t* input, + uint64_t size, + bool is_into, + const VecOpsConfig* config, + bls12_377::scalar_t* output); -extern "C" eIcicleError bls12_377_ntt_init_domain( - bls12_377::scalar_t* primitive_root, const NTTInitDomainConfig* config); +extern "C" eIcicleError +bls12_377_ntt_init_domain(bls12_377::scalar_t* primitive_root, const NTTInitDomainConfig* config); extern "C" eIcicleError bls12_377_ntt( - const bls12_377::scalar_t* input, int size, NTTDir dir, const NTTConfig* config, bls12_377::scalar_t* output); + const bls12_377::scalar_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + bls12_377::scalar_t* output); extern "C" eIcicleError bls12_377_ntt_release_domain(); extern "C" eIcicleError bls12_377_vector_mul( - const bls12_377::scalar_t* vec_a, const bls12_377::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bls12_377::scalar_t* result); + const bls12_377::scalar_t* vec_a, + const bls12_377::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bls12_377::scalar_t* result); extern "C" eIcicleError bls12_377_vector_add( - const bls12_377::scalar_t* vec_a, const bls12_377::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bls12_377::scalar_t* result); + const bls12_377::scalar_t* vec_a, + const bls12_377::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bls12_377::scalar_t* result); extern "C" eIcicleError bls12_377_vector_sub( - const bls12_377::scalar_t* vec_a, const bls12_377::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bls12_377::scalar_t* result); + const bls12_377::scalar_t* vec_a, + const bls12_377::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bls12_377::scalar_t* result); extern "C" eIcicleError bls12_377_matrix_transpose( const bls12_377::scalar_t* input, @@ -91,5 +128,3 @@ extern "C" eIcicleError bls12_377_matrix_transpose( extern "C" eIcicleError bls12_377_bit_reverse( const bls12_377::scalar_t* input, uint64_t n, const VecOpsConfig* 
config, bls12_377::scalar_t* output); - - diff --git a/icicle/include/icicle/api/bls12_381.h b/icicle/include/icicle/api/bls12_381.h index 361731586..b62e6a61a 100644 --- a/icicle/include/icicle/api/bls12_381.h +++ b/icicle/include/icicle/api/bls12_381.h @@ -19,23 +19,35 @@ extern "C" void bls12_381_g2_generate_projective_points(bls12_381::g2_projective extern "C" void bls12_381_g2_generate_affine_points(bls12_381::g2_affine_t* points, int size); extern "C" eIcicleError bls12_381_g2_affine_convert_montgomery( - const bls12_381::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::g2_affine_t* output); + const bls12_381::g2_affine_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + bls12_381::g2_affine_t* output); extern "C" eIcicleError bls12_381_g2_projective_convert_montgomery( - const bls12_381::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::g2_projective_t* output); + const bls12_381::g2_projective_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + bls12_381::g2_projective_t* output); extern "C" eIcicleError bls12_381_ecntt( - const bls12_381::projective_t* input, int size, NTTDir dir, const NTTConfig* config, bls12_381::projective_t* output); - + const bls12_381::projective_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + bls12_381::projective_t* output); extern "C" eIcicleError bls12_381_precompute_msm_bases( - const bls12_381::affine_t* bases, - int nof_bases, - const MSMConfig* config, - bls12_381::affine_t* output_bases); + const bls12_381::affine_t* bases, int nof_bases, const MSMConfig* config, bls12_381::affine_t* output_bases); extern "C" eIcicleError bls12_381_msm( - const bls12_381::scalar_t* scalars, const bls12_381::affine_t* points, int msm_size, const MSMConfig* config, bls12_381::projective_t* out); + const bls12_381::scalar_t* scalars, + const bls12_381::affine_t* points, + int msm_size, + const MSMConfig* config, + 
bls12_381::projective_t* out); extern "C" bool bls12_381_eq(bls12_381::projective_t* point1, bls12_381::projective_t* point2); @@ -49,38 +61,63 @@ extern "C" eIcicleError bls12_381_affine_convert_montgomery( const bls12_381::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::affine_t* output); extern "C" eIcicleError bls12_381_projective_convert_montgomery( - const bls12_381::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::projective_t* output); + const bls12_381::projective_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + bls12_381::projective_t* output); extern "C" eIcicleError bls12_381_g2_precompute_msm_bases( - const bls12_381::g2_affine_t* bases, - int nof_bases, - const MSMConfig* config, - bls12_381::g2_affine_t* output_bases); + const bls12_381::g2_affine_t* bases, int nof_bases, const MSMConfig* config, bls12_381::g2_affine_t* output_bases); extern "C" eIcicleError bls12_381_g2_msm( - const bls12_381::scalar_t* scalars, const bls12_381::g2_affine_t* points, int msm_size, const MSMConfig* config, bls12_381::g2_projective_t* out); + const bls12_381::scalar_t* scalars, + const bls12_381::g2_affine_t* points, + int msm_size, + const MSMConfig* config, + bls12_381::g2_projective_t* out); extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size); extern "C" void bls12_381_scalar_convert_montgomery( - const bls12_381::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bls12_381::scalar_t* output); + const bls12_381::scalar_t* input, + uint64_t size, + bool is_into, + const VecOpsConfig* config, + bls12_381::scalar_t* output); -extern "C" eIcicleError bls12_381_ntt_init_domain( - bls12_381::scalar_t* primitive_root, const NTTInitDomainConfig* config); +extern "C" eIcicleError +bls12_381_ntt_init_domain(bls12_381::scalar_t* primitive_root, const NTTInitDomainConfig* config); extern "C" eIcicleError bls12_381_ntt( - const 
bls12_381::scalar_t* input, int size, NTTDir dir, const NTTConfig* config, bls12_381::scalar_t* output); + const bls12_381::scalar_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + bls12_381::scalar_t* output); extern "C" eIcicleError bls12_381_ntt_release_domain(); extern "C" eIcicleError bls12_381_vector_mul( - const bls12_381::scalar_t* vec_a, const bls12_381::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bls12_381::scalar_t* result); + const bls12_381::scalar_t* vec_a, + const bls12_381::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bls12_381::scalar_t* result); extern "C" eIcicleError bls12_381_vector_add( - const bls12_381::scalar_t* vec_a, const bls12_381::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bls12_381::scalar_t* result); + const bls12_381::scalar_t* vec_a, + const bls12_381::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bls12_381::scalar_t* result); extern "C" eIcicleError bls12_381_vector_sub( - const bls12_381::scalar_t* vec_a, const bls12_381::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bls12_381::scalar_t* result); + const bls12_381::scalar_t* vec_a, + const bls12_381::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bls12_381::scalar_t* result); extern "C" eIcicleError bls12_381_matrix_transpose( const bls12_381::scalar_t* input, @@ -91,5 +128,3 @@ extern "C" eIcicleError bls12_381_matrix_transpose( extern "C" eIcicleError bls12_381_bit_reverse( const bls12_381::scalar_t* input, uint64_t n, const VecOpsConfig* config, bls12_381::scalar_t* output); - - diff --git a/icicle/include/icicle/api/bn254.h b/icicle/include/icicle/api/bn254.h index 928cb639e..f3aad8d53 100644 --- a/icicle/include/icicle/api/bn254.h +++ b/icicle/include/icicle/api/bn254.h @@ -22,20 +22,28 @@ extern "C" eIcicleError bn254_g2_affine_convert_montgomery( const bn254::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::g2_affine_t* output); extern "C" 
eIcicleError bn254_g2_projective_convert_montgomery( - const bn254::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::g2_projective_t* output); + const bn254::g2_projective_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + bn254::g2_projective_t* output); extern "C" eIcicleError bn254_ecntt( - const bn254::projective_t* input, int size, NTTDir dir, const NTTConfig* config, bn254::projective_t* output); - + const bn254::projective_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + bn254::projective_t* output); extern "C" eIcicleError bn254_precompute_msm_bases( - const bn254::affine_t* bases, - int nof_bases, - const MSMConfig* config, - bn254::affine_t* output_bases); + const bn254::affine_t* bases, int nof_bases, const MSMConfig* config, bn254::affine_t* output_bases); extern "C" eIcicleError bn254_msm( - const bn254::scalar_t* scalars, const bn254::affine_t* points, int msm_size, const MSMConfig* config, bn254::projective_t* out); + const bn254::scalar_t* scalars, + const bn254::affine_t* points, + int msm_size, + const MSMConfig* config, + bn254::projective_t* out); extern "C" bool bn254_eq(bn254::projective_t* point1, bn254::projective_t* point2); @@ -49,38 +57,54 @@ extern "C" eIcicleError bn254_affine_convert_montgomery( const bn254::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::affine_t* output); extern "C" eIcicleError bn254_projective_convert_montgomery( - const bn254::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::projective_t* output); + const bn254::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::projective_t* output); extern "C" eIcicleError bn254_g2_precompute_msm_bases( - const bn254::g2_affine_t* bases, - int nof_bases, - const MSMConfig* config, - bn254::g2_affine_t* output_bases); + const bn254::g2_affine_t* bases, int nof_bases, const MSMConfig* config, bn254::g2_affine_t* output_bases); 
extern "C" eIcicleError bn254_g2_msm( - const bn254::scalar_t* scalars, const bn254::g2_affine_t* points, int msm_size, const MSMConfig* config, bn254::g2_projective_t* out); + const bn254::scalar_t* scalars, + const bn254::g2_affine_t* points, + int msm_size, + const MSMConfig* config, + bn254::g2_projective_t* out); extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size); extern "C" void bn254_scalar_convert_montgomery( const bn254::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bn254::scalar_t* output); -extern "C" eIcicleError bn254_ntt_init_domain( - bn254::scalar_t* primitive_root, const NTTInitDomainConfig* config); +extern "C" eIcicleError bn254_ntt_init_domain(bn254::scalar_t* primitive_root, const NTTInitDomainConfig* config); extern "C" eIcicleError bn254_ntt( - const bn254::scalar_t* input, int size, NTTDir dir, const NTTConfig* config, bn254::scalar_t* output); + const bn254::scalar_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + bn254::scalar_t* output); extern "C" eIcicleError bn254_ntt_release_domain(); extern "C" eIcicleError bn254_vector_mul( - const bn254::scalar_t* vec_a, const bn254::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result); + const bn254::scalar_t* vec_a, + const bn254::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bn254::scalar_t* result); extern "C" eIcicleError bn254_vector_add( - const bn254::scalar_t* vec_a, const bn254::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result); + const bn254::scalar_t* vec_a, + const bn254::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bn254::scalar_t* result); extern "C" eIcicleError bn254_vector_sub( - const bn254::scalar_t* vec_a, const bn254::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result); + const bn254::scalar_t* vec_a, + const bn254::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + 
bn254::scalar_t* result); extern "C" eIcicleError bn254_matrix_transpose( const bn254::scalar_t* input, @@ -89,7 +113,5 @@ extern "C" eIcicleError bn254_matrix_transpose( const VecOpsConfig* config, bn254::scalar_t* output); -extern "C" eIcicleError bn254_bit_reverse( - const bn254::scalar_t* input, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* output); - - +extern "C" eIcicleError +bn254_bit_reverse(const bn254::scalar_t* input, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* output); diff --git a/icicle/include/icicle/api/bw6_761.h b/icicle/include/icicle/api/bw6_761.h index 6b48606a2..0147091e5 100644 --- a/icicle/include/icicle/api/bw6_761.h +++ b/icicle/include/icicle/api/bw6_761.h @@ -22,20 +22,28 @@ extern "C" eIcicleError bw6_761_g2_affine_convert_montgomery( const bw6_761::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::g2_affine_t* output); extern "C" eIcicleError bw6_761_g2_projective_convert_montgomery( - const bw6_761::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::g2_projective_t* output); + const bw6_761::g2_projective_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + bw6_761::g2_projective_t* output); extern "C" eIcicleError bw6_761_ecntt( - const bw6_761::projective_t* input, int size, NTTDir dir, const NTTConfig* config, bw6_761::projective_t* output); - + const bw6_761::projective_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + bw6_761::projective_t* output); extern "C" eIcicleError bw6_761_precompute_msm_bases( - const bw6_761::affine_t* bases, - int nof_bases, - const MSMConfig* config, - bw6_761::affine_t* output_bases); + const bw6_761::affine_t* bases, int nof_bases, const MSMConfig* config, bw6_761::affine_t* output_bases); extern "C" eIcicleError bw6_761_msm( - const bw6_761::scalar_t* scalars, const bw6_761::affine_t* points, int msm_size, const MSMConfig* config, bw6_761::projective_t* out); + const 
bw6_761::scalar_t* scalars, + const bw6_761::affine_t* points, + int msm_size, + const MSMConfig* config, + bw6_761::projective_t* out); extern "C" bool bw6_761_eq(bw6_761::projective_t* point1, bw6_761::projective_t* point2); @@ -49,38 +57,58 @@ extern "C" eIcicleError bw6_761_affine_convert_montgomery( const bw6_761::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::affine_t* output); extern "C" eIcicleError bw6_761_projective_convert_montgomery( - const bw6_761::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::projective_t* output); + const bw6_761::projective_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + bw6_761::projective_t* output); extern "C" eIcicleError bw6_761_g2_precompute_msm_bases( - const bw6_761::g2_affine_t* bases, - int nof_bases, - const MSMConfig* config, - bw6_761::g2_affine_t* output_bases); + const bw6_761::g2_affine_t* bases, int nof_bases, const MSMConfig* config, bw6_761::g2_affine_t* output_bases); extern "C" eIcicleError bw6_761_g2_msm( - const bw6_761::scalar_t* scalars, const bw6_761::g2_affine_t* points, int msm_size, const MSMConfig* config, bw6_761::g2_projective_t* out); + const bw6_761::scalar_t* scalars, + const bw6_761::g2_affine_t* points, + int msm_size, + const MSMConfig* config, + bw6_761::g2_projective_t* out); extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size); extern "C" void bw6_761_scalar_convert_montgomery( const bw6_761::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bw6_761::scalar_t* output); -extern "C" eIcicleError bw6_761_ntt_init_domain( - bw6_761::scalar_t* primitive_root, const NTTInitDomainConfig* config); +extern "C" eIcicleError bw6_761_ntt_init_domain(bw6_761::scalar_t* primitive_root, const NTTInitDomainConfig* config); extern "C" eIcicleError bw6_761_ntt( - const bw6_761::scalar_t* input, int size, NTTDir dir, const NTTConfig* config, bw6_761::scalar_t* output); + 
const bw6_761::scalar_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + bw6_761::scalar_t* output); extern "C" eIcicleError bw6_761_ntt_release_domain(); extern "C" eIcicleError bw6_761_vector_mul( - const bw6_761::scalar_t* vec_a, const bw6_761::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bw6_761::scalar_t* result); + const bw6_761::scalar_t* vec_a, + const bw6_761::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bw6_761::scalar_t* result); extern "C" eIcicleError bw6_761_vector_add( - const bw6_761::scalar_t* vec_a, const bw6_761::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bw6_761::scalar_t* result); + const bw6_761::scalar_t* vec_a, + const bw6_761::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bw6_761::scalar_t* result); extern "C" eIcicleError bw6_761_vector_sub( - const bw6_761::scalar_t* vec_a, const bw6_761::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bw6_761::scalar_t* result); + const bw6_761::scalar_t* vec_a, + const bw6_761::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bw6_761::scalar_t* result); extern "C" eIcicleError bw6_761_matrix_transpose( const bw6_761::scalar_t* input, @@ -89,7 +117,5 @@ extern "C" eIcicleError bw6_761_matrix_transpose( const VecOpsConfig* config, bw6_761::scalar_t* output); -extern "C" eIcicleError bw6_761_bit_reverse( - const bw6_761::scalar_t* input, uint64_t n, const VecOpsConfig* config, bw6_761::scalar_t* output); - - +extern "C" eIcicleError +bw6_761_bit_reverse(const bw6_761::scalar_t* input, uint64_t n, const VecOpsConfig* config, bw6_761::scalar_t* output); diff --git a/icicle/include/icicle/api/grumpkin.h b/icicle/include/icicle/api/grumpkin.h index 42b1b2195..4c308e5c3 100644 --- a/icicle/include/icicle/api/grumpkin.h +++ b/icicle/include/icicle/api/grumpkin.h @@ -10,13 +10,14 @@ #include "icicle/vec_ops.h" extern "C" eIcicleError grumpkin_precompute_msm_bases( - const grumpkin::affine_t* bases, - int nof_bases, - 
const MSMConfig* config, - grumpkin::affine_t* output_bases); + const grumpkin::affine_t* bases, int nof_bases, const MSMConfig* config, grumpkin::affine_t* output_bases); extern "C" eIcicleError grumpkin_msm( - const grumpkin::scalar_t* scalars, const grumpkin::affine_t* points, int msm_size, const MSMConfig* config, grumpkin::projective_t* out); + const grumpkin::scalar_t* scalars, + const grumpkin::affine_t* points, + int msm_size, + const MSMConfig* config, + grumpkin::projective_t* out); extern "C" bool grumpkin_eq(grumpkin::projective_t* point1, grumpkin::projective_t* point2); @@ -30,7 +31,11 @@ extern "C" eIcicleError grumpkin_affine_convert_montgomery( const grumpkin::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, grumpkin::affine_t* output); extern "C" eIcicleError grumpkin_projective_convert_montgomery( - const grumpkin::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, grumpkin::projective_t* output); + const grumpkin::projective_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + grumpkin::projective_t* output); extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size); @@ -38,13 +43,25 @@ extern "C" void grumpkin_scalar_convert_montgomery( const grumpkin::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, grumpkin::scalar_t* output); extern "C" eIcicleError grumpkin_vector_mul( - const grumpkin::scalar_t* vec_a, const grumpkin::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, grumpkin::scalar_t* result); + const grumpkin::scalar_t* vec_a, + const grumpkin::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + grumpkin::scalar_t* result); extern "C" eIcicleError grumpkin_vector_add( - const grumpkin::scalar_t* vec_a, const grumpkin::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, grumpkin::scalar_t* result); + const grumpkin::scalar_t* vec_a, + const grumpkin::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, 
+ grumpkin::scalar_t* result); extern "C" eIcicleError grumpkin_vector_sub( - const grumpkin::scalar_t* vec_a, const grumpkin::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, grumpkin::scalar_t* result); + const grumpkin::scalar_t* vec_a, + const grumpkin::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + grumpkin::scalar_t* result); extern "C" eIcicleError grumpkin_matrix_transpose( const grumpkin::scalar_t* input, @@ -55,5 +72,3 @@ extern "C" eIcicleError grumpkin_matrix_transpose( extern "C" eIcicleError grumpkin_bit_reverse( const grumpkin::scalar_t* input, uint64_t n, const VecOpsConfig* config, grumpkin::scalar_t* output); - - diff --git a/icicle/include/icicle/api/stark252.h b/icicle/include/icicle/api/stark252.h index 6a8ff1a74..5020a5966 100644 --- a/icicle/include/icicle/api/stark252.h +++ b/icicle/include/icicle/api/stark252.h @@ -14,22 +14,37 @@ extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size) extern "C" void stark252_scalar_convert_montgomery( const stark252::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, stark252::scalar_t* output); -extern "C" eIcicleError stark252_ntt_init_domain( - stark252::scalar_t* primitive_root, const NTTInitDomainConfig* config); +extern "C" eIcicleError stark252_ntt_init_domain(stark252::scalar_t* primitive_root, const NTTInitDomainConfig* config); extern "C" eIcicleError stark252_ntt( - const stark252::scalar_t* input, int size, NTTDir dir, const NTTConfig* config, stark252::scalar_t* output); + const stark252::scalar_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + stark252::scalar_t* output); extern "C" eIcicleError stark252_ntt_release_domain(); extern "C" eIcicleError stark252_vector_mul( - const stark252::scalar_t* vec_a, const stark252::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, stark252::scalar_t* result); + const stark252::scalar_t* vec_a, + const stark252::scalar_t* vec_b, + uint64_t n, + const 
VecOpsConfig* config, + stark252::scalar_t* result); extern "C" eIcicleError stark252_vector_add( - const stark252::scalar_t* vec_a, const stark252::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, stark252::scalar_t* result); + const stark252::scalar_t* vec_a, + const stark252::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + stark252::scalar_t* result); extern "C" eIcicleError stark252_vector_sub( - const stark252::scalar_t* vec_a, const stark252::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, stark252::scalar_t* result); + const stark252::scalar_t* vec_a, + const stark252::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + stark252::scalar_t* result); extern "C" eIcicleError stark252_matrix_transpose( const stark252::scalar_t* input, @@ -40,5 +55,3 @@ extern "C" eIcicleError stark252_matrix_transpose( extern "C" eIcicleError stark252_bit_reverse( const stark252::scalar_t* input, uint64_t n, const VecOpsConfig* config, stark252::scalar_t* output); - - diff --git a/icicle/include/icicle/backend/vec_ops_backend.h b/icicle/include/icicle/backend/vec_ops_backend.h index 92610798f..b602e2644 100644 --- a/icicle/include/icicle/backend/vec_ops_backend.h +++ b/icicle/include/icicle/backend/vec_ops_backend.h @@ -16,11 +16,7 @@ namespace icicle { scalar_t* output)>; using vectorVectorOpImplInplaceA = std::function; + const Device& device, scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config)>; using scalarConvertMontgomeryImpl = std::function; using VectorReduceOpImpl = std::function; + const Device& device, const scalar_t* vec_a, uint64_t size, const VecOpsConfig& config, scalar_t* output)>; using scalarVectorOpImpl = std::function; using scalarBitReverseOpImpl = std::function; + const Device& device, const scalar_t* input, uint64_t size, const VecOpsConfig& config, scalar_t* output)>; using scalarSliceOpImpl = std::function; using scalarHighNonZeroIdxOpImpl = std::function; + const Device& device, 
const scalar_t* input, uint64_t size, const VecOpsConfig& config, int64_t* out_idx)>; using scalarPolyEvalImpl = std::function; - - - void register_vector_add(const std::string& deviceType, vectorVectorOpImpl impl); #define REGISTER_VECTOR_ADD_BACKEND(DEVICE_TYPE, FUNC) \ @@ -173,10 +154,10 @@ namespace icicle { void register_vector_product(const std::string& deviceType, VectorReduceOpImpl impl); -#define REGISTER_VECTOR_PRODUCT_BACKEND(DEVICE_TYPE, FUNC) \ +#define REGISTER_VECTOR_PRODUCT_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ - static bool UNIQUE(_reg_vec_product) = []() -> bool { \ - register_vector_product(DEVICE_TYPE, FUNC); \ + static bool UNIQUE(_reg_vec_product) = []() -> bool { \ + register_vector_product(DEVICE_TYPE, FUNC); \ return true; \ }(); \ } diff --git a/icicle/include/icicle/fields/complex_extension.h b/icicle/include/icicle/fields/complex_extension.h index 9b4d35d24..6495822bd 100644 --- a/icicle/include/icicle/fields/complex_extension.h +++ b/icicle/include/icicle/fields/complex_extension.h @@ -36,9 +36,15 @@ class ComplexExtensionField FF real; FF imaginary; - static constexpr HOST_DEVICE_INLINE ComplexExtensionField zero() { return ComplexExtensionField{FF::zero(), FF::zero()}; } + static constexpr HOST_DEVICE_INLINE ComplexExtensionField zero() + { + return ComplexExtensionField{FF::zero(), FF::zero()}; + } - static constexpr HOST_DEVICE_INLINE ComplexExtensionField one() { return ComplexExtensionField{FF::one(), FF::zero()}; } + static constexpr HOST_DEVICE_INLINE ComplexExtensionField one() + { + return ComplexExtensionField{FF::one(), FF::zero()}; + } static constexpr HOST_DEVICE_INLINE ComplexExtensionField to_montgomery(const ComplexExtensionField& xs) { @@ -50,7 +56,10 @@ class ComplexExtensionField return ComplexExtensionField{xs.real * FF{CONFIG::montgomery_r_inv}, xs.imaginary * FF{CONFIG::montgomery_r_inv}}; } - static HOST_INLINE ComplexExtensionField rand_host() { return ComplexExtensionField{FF::rand_host(), 
FF::rand_host()}; } + static HOST_INLINE ComplexExtensionField rand_host() + { + return ComplexExtensionField{FF::rand_host(), FF::rand_host()}; + } static void rand_host_many(ComplexExtensionField* out, int size) { @@ -61,7 +70,8 @@ class ComplexExtensionField template static constexpr HOST_DEVICE_INLINE ComplexExtensionField sub_modulus(const ComplexExtensionField& xs) { - return ComplexExtensionField{FF::sub_modulus(&xs.real), FF::sub_modulus(&xs.imaginary)}; + return ComplexExtensionField{ + FF::sub_modulus(&xs.real), FF::sub_modulus(&xs.imaginary)}; } friend std::ostream& operator<<(std::ostream& os, const ComplexExtensionField& xs) @@ -101,7 +111,8 @@ class ComplexExtensionField } template - static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ComplexExtensionField& xs, const ComplexExtensionField& ys) + static constexpr HOST_DEVICE_INLINE ExtensionWide + mul_wide(const ComplexExtensionField& xs, const ComplexExtensionField& ys) { FWide real_prod = FF::mul_wide(xs.real, ys.real); FWide imaginary_prod = FF::mul_wide(xs.imaginary, ys.imaginary); @@ -142,7 +153,10 @@ class ComplexExtensionField return (xs.real == ys.real) && (xs.imaginary == ys.imaginary); } - friend HOST_DEVICE_INLINE bool operator!=(const ComplexExtensionField& xs, const ComplexExtensionField& ys) { return !(xs == ys); } + friend HOST_DEVICE_INLINE bool operator!=(const ComplexExtensionField& xs, const ComplexExtensionField& ys) + { + return !(xs == ys); + } template static HOST_DEVICE_INLINE ComplexExtensionField mul_const(const ComplexExtensionField& xs) diff --git a/icicle/include/icicle/fields/quartic_extension.h b/icicle/include/icicle/fields/quartic_extension.h index 923b31f3a..2ba17c05c 100644 --- a/icicle/include/icicle/fields/quartic_extension.h +++ b/icicle/include/icicle/fields/quartic_extension.h @@ -119,7 +119,8 @@ class QuarticExtensionField } template - static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const QuarticExtensionField& xs, const 
QuarticExtensionField& ys) + static constexpr HOST_DEVICE_INLINE ExtensionWide + mul_wide(const QuarticExtensionField& xs, const QuarticExtensionField& ys) { if (CONFIG::nonresidue_is_negative) return ExtensionWide{ @@ -179,7 +180,10 @@ class QuarticExtensionField return (xs.real == ys.real) && (xs.im1 == ys.im1) && (xs.im2 == ys.im2) && (xs.im3 == ys.im3); } - friend HOST_DEVICE_INLINE bool operator!=(const QuarticExtensionField& xs, const QuarticExtensionField& ys) { return !(xs == ys); } + friend HOST_DEVICE_INLINE bool operator!=(const QuarticExtensionField& xs, const QuarticExtensionField& ys) + { + return !(xs == ys); + } template static constexpr HOST_DEVICE_INLINE QuarticExtensionField mul_unsigned(const QuarticExtensionField& xs) diff --git a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h index 0ee0e2d0f..c7e53b218 100644 --- a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h +++ b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h @@ -278,15 +278,7 @@ namespace icicle { config.is_result_on_device = true; ICICLE_CHECK(icicle::polynomial_division( - a_coeffs, - deg_a, - b_coeffs, - deg_b, - deg_a - deg_b + 1, - a_N, - config, - Q_coeffs, - R_coeffs)); + a_coeffs, deg_a, b_coeffs, deg_b, deg_a - deg_b + 1, a_N, config, Q_coeffs, R_coeffs)); } void quotient(PolyContext Q, PolyContext op_a, PolyContext op_b) override @@ -554,8 +546,8 @@ namespace icicle { config.is_result_on_device = true; config.is_async = true; config.stream = m_stream; - ICICLE_CHECK( - icicle::slice(get_context_storage_immutable(p), 0 /*offset*/, stride, poly_size, domain_size, config, d_evals)); + ICICLE_CHECK(icicle::slice( + get_context_storage_immutable(p), 0 /*offset*/, stride, poly_size, domain_size, config, d_evals)); } else { ICICLE_CHECK(icicle_memset(d_evals, 0, domain_size * sizeof(I))); auto ntt_config = default_ntt_config(); diff 
--git a/icicle/include/icicle/utils/modifiers.h b/icicle/include/icicle/utils/modifiers.h index 74520c9f9..ac62028a8 100644 --- a/icicle/include/icicle/utils/modifiers.h +++ b/icicle/include/icicle/utils/modifiers.h @@ -14,7 +14,7 @@ #define HOST_INLINE __host__ INLINE_MACRO #define DEVICE_INLINE __device__ INLINE_MACRO -#define HOST_DEVICE __host__ __device__ +#define HOST_DEVICE __host__ __device__ #define HOST_DEVICE_INLINE HOST_DEVICE INLINE_MACRO #else // not CUDA #define INLINE_MACRO diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index b89327eb4..524cbcdc5 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -17,22 +17,21 @@ namespace icicle { * @note APIs with a single input, ignore input b. */ struct VecOpsConfig { - icicleStreamHandle stream; /** Stream for asynchronous execution. */ - bool is_a_on_device; /** True if `a` is on the device, false if it is not. Default value: false. */ - bool is_b_on_device; /** True if `b` is on the device, false if it is not. Default value: false. OPTIONAL. */ - bool is_result_on_device; /** If true, the output is preserved on the device, otherwise on the host. Default value: - false. */ - bool is_async; /** Whether to run the vector operations asynchronously. - If set to `true`, the function will be non-blocking and synchronization - must be explicitly managed using `cudaStreamSynchronize` or `cudaDeviceSynchronize`. - If set to `false`, the function will block the current CPU thread. */ - int batch_size; /** Number of vectors (or operations) to process in a batch. - Each vector operation will be performed independently on each batch element. - Default value: 1. */ - bool - columns_batch; /** True if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). - If false, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array). - Default value: false. 
*/ + icicleStreamHandle stream; /** Stream for asynchronous execution. */ + bool is_a_on_device; /** True if `a` is on the device, false if it is not. Default value: false. */ + bool is_b_on_device; /** True if `b` is on the device, false if it is not. Default value: false. OPTIONAL. */ + bool is_result_on_device; /** If true, the output is preserved on the device, otherwise on the host. Default value: + false. */ + bool is_async; /** Whether to run the vector operations asynchronously. + If set to `true`, the function will be non-blocking and synchronization + must be explicitly managed using `cudaStreamSynchronize` or `cudaDeviceSynchronize`. + If set to `false`, the function will block the current CPU thread. */ + int batch_size; /** Number of vectors (or operations) to process in a batch. + Each vector operation will be performed independently on each batch element. + Default value: 1. */ + bool columns_batch; /** True if the batched vectors are stored as columns in a 2D array (i.e., the vectors are + strided in memory as columns of a matrix). If false, the batched vectors are stored + contiguously in memory (e.g., as rows or in a flat array). Default value: false. */ ConfigExtension* ext = nullptr; /** Backend-specific extension. */ }; @@ -93,7 +92,8 @@ namespace icicle { * @return eIcicleError Error code indicating success or failure. */ template - eIcicleError vector_accumulate(T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config); // use vector_add (inplace) + eIcicleError + vector_accumulate(T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config); // use vector_add (inplace) /** * @brief Subtracts vector `b` from vector `a` element-wise. @@ -172,7 +172,8 @@ namespace icicle { * @return eIcicleError Error code indicating success or failure. 
*/ template - eIcicleError convert_montgomery(const T* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, T* output); + eIcicleError + convert_montgomery(const T* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, T* output); // Reduction operations @@ -195,25 +196,23 @@ namespace icicle { eIcicleError vector_sum(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); /** - * @brief Computes the product of all elements in each vector in the batch. - * - * @tparam T Type of the elements in the vectors. - * @param vec_a Pointer to the input vector(s). - * - If `config.batch_size > 1`, this should be a concatenated array of vectors. - * - The layout depends on `config.columns_batch`: - * - If `false`, vectors are stored contiguously. - * - If `true`, vectors are stored as columns in a 2D array. - * @param size Number of elements in each vector. - * @param config Configuration for the operation. - * @param output Pointer to the output array where the results will be stored. - * @return eIcicleError Error code indicating success or failure. - */ + * @brief Computes the product of all elements in each vector in the batch. + * + * @tparam T Type of the elements in the vectors. + * @param vec_a Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in each vector. + * @param config Configuration for the operation. + * @param output Pointer to the output array where the results will be stored. + * @return eIcicleError Error code indicating success or failure. 
+ */ template eIcicleError vector_product(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); - - // Scalar-Vector operations /** @@ -222,21 +221,24 @@ namespace icicle { * @tparam T Type of the elements in the vector and the scalar. * @param scalar_a Pointer to the input scalar(s). * - If `use_single_scalar` is `true`, this should point to a single scalar value. - * - If `use_single_scalar` is `false`, this should point to an array of scalars with length `config.batch_size`. - * @param vec_b Pointer to the input vector(s). + * - If `use_single_scalar` is `false`, this should point to an array of scalars with length + * `config.batch_size`. + * @param vec_b Pointer to the input vector(s). * - If `config.batch_size > 1`, this should be a concatenated array of vectors. * - The layout depends on `config.columns_batch`: * - If `false`, vectors are stored contiguously. * - If `true`, vectors are stored as columns in a 2D array. * @param size Number of elements in a vector. - * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of scalars (`false`). + * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of + * scalars (`false`). * @param config Configuration for the operation. * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. - * @note To subtract a scalar from each element of a vector - use scalar_add_vec with negative scalar. + * @note To subtract a scalar from each element of a vector - use scalar_add_vec with negative scalar. 
*/ template - eIcicleError scalar_add_vec(const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); + eIcicleError scalar_add_vec( + const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); /** * @brief Subtracts each element of a vector from a scalar, elementwise (res[i]=scalar-vec[i]). @@ -244,21 +246,24 @@ namespace icicle { * @tparam T Type of the elements in the vector and the scalar. * @param scalar_a Pointer to Input scalar(s). * - If `use_single_scalar` is `true`, this should point to a single scalar value. - * - If `use_single_scalar` is `false`, this should point to an array of scalars with length `config.batch_size`. - * @param vec_b Pointer to the input vector(s). + * - If `use_single_scalar` is `false`, this should point to an array of scalars with length + * `config.batch_size`. + * @param vec_b Pointer to the input vector(s). * - If `config.batch_size > 1`, this should be a concatenated array of vectors. * - The layout depends on `config.columns_batch`: * - If `false`, vectors are stored contiguously. * - If `true`, vectors are stored as columns in a 2D array. * @param size Number of elements in a vector. - * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of scalars (`false`). + * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of + * scalars (`false`). * @param config Configuration for the operation. * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. * @note To subtract a scalar from each element of a vector - use scalar_add_vec with negative scalar. 
*/ template - eIcicleError scalar_sub_vec(const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); + eIcicleError scalar_sub_vec( + const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); /** * @brief Multiplies each element of a vector by a scalar. @@ -266,20 +271,23 @@ namespace icicle { * @tparam T Type of the elements in the vector and the scalar. * @param scalar_a Pointer to Input scalar(s). * - If `use_single_scalar` is `true`, this should point to a single scalar value. - * - If `use_single_scalar` is `false`, this should point to an array of scalars with length `config.batch_size`. - * @param vec_b Pointer to the input vector(s). + * - If `use_single_scalar` is `false`, this should point to an array of scalars with length + * `config.batch_size`. + * @param vec_b Pointer to the input vector(s). * - If `config.batch_size > 1`, this should be a concatenated array of vectors. * - The layout depends on `config.columns_batch`: * - If `false`, vectors are stored contiguously. * - If `true`, vectors are stored as columns in a 2D array. * @param size Number of elements in a vector. - * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of scalars (`false`). + * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of + * scalars (`false`). * @param config Configuration for the operation. * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. 
*/ template - eIcicleError scalar_mul_vec(const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); + eIcicleError scalar_mul_vec( + const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); // Matrix operations @@ -294,14 +302,13 @@ namespace icicle { * @param mat_out Pointer to the output matrix or matrices where the transposed matrices will be stored. * @return eIcicleError Error code indicating success or failure. * @note The input matrices are assumed to be stored in row-major order. - * This function transposes an input matrix or a batch of matrices. + * This function transposes an input matrix or a batch of matrices. * Matrix transpose inplace is not supported for non-power of 2 rows and columns. */ template eIcicleError matrix_transpose(const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out); - - + // Miscellaneous operations /** @@ -309,10 +316,10 @@ namespace icicle { * * @tparam T Type of the elements in the vector. * @param vec_in Pointer to the input vector(s). - * - If `config.batch_size > 1`, this should be a concatenated array of vectors. - * - The layout depends on `config.columns_batch`: - * - If `false`, vectors are stored contiguously. - * - If `true`, vectors are stored as columns in a 2D array. + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. * @param size Number of elements in each vector. Must be a power of 2. * @param config Configuration for the operation. * @param vec_out Pointer to the output vector(s) where the results will be stored. @@ -332,7 +339,7 @@ namespace icicle { * @param stride Stride between elements in the slice. * @param size_in Number of elements in one input vector. 
* @param size_out Number of elements in one input vector. - * @param config Configuration for the operation. + * @param config Configuration for the operation. * @param vec_out Pointer to the output vector(s) where the results will be stored. * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. @@ -341,8 +348,14 @@ namespace icicle { * parameters must satisfy: offset + (size_out-1) * stride < size_in */ template - eIcicleError - slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size_in, uint64_t size_out, const VecOpsConfig& config, T* vec_out); + eIcicleError slice( + const T* vec_in, + uint64_t offset, + uint64_t stride, + uint64_t size_in, + uint64_t size_out, + const VecOpsConfig& config, + T* vec_out); /** * @brief Finds the highest non-zero index in a vector or batch of vectors. @@ -351,8 +364,8 @@ namespace icicle { * @param vec_in Pointer to the input vector(s). * @param size Number of elements in each input vector. * @param config Configuration for the operation. - * @param out_idx Pointer to an array where the output indices of the highest non-zero element in each input vector will be stored. - * The array should have a length of `config.batch_size`. + * @param out_idx Pointer to an array where the output indices of the highest non-zero element in each input vector + * will be stored. The array should have a length of `config.batch_size`. * @return eIcicleError Error code indicating success or failure. */ template @@ -364,18 +377,19 @@ namespace icicle { * @tparam T Type of the elements in the polynomial and domain. * @param coeffs Pointer to the array of coefficients of the polynomial(s). * - The size of `coeffs` should be `coeffs_size * batch_size`. - * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored contiguously. 
+ * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored + * contiguously. * - If `config.columns_batch` is `true`, coefficients are interleaved. * @param coeffs_size Number of coefficients in each polynomial. * @param domain Pointer to the array of points at which to evaluate the polynomial(s). - * - The same domain is used for all polynomials. - * - The size of `domain` should be `domain_size`. + * - The same domain is used for all polynomials. + * - The size of `domain` should be `domain_size`. * @param domain_size Number of domain points. * @param config Configuration for the operation. * @param evals Pointer to the array where the evaluated results will be stored. This is an output parameter. - * - The size of `evals` should be `domain_size * batch_size`. - * - If `config.columns_batch` is `false`, results for each polynomial are stored contiguously. - * - If `config.columns_batch` is `true`, results are interleaved. + * - The size of `evals` should be `domain_size * batch_size`. + * - If `config.columns_batch` is `false`, results for each polynomial are stored contiguously. + * - If `config.columns_batch` is `true`, results are interleaved. * @return eIcicleError Error code indicating success or failure. */ template @@ -393,7 +407,8 @@ namespace icicle { * @tparam T Type of the elements in the polynomials. * @param numerator Pointer to the array of coefficients of the numerator polynomial(s). * - The size of `numerator` should be `(numerator_deg + 1) * batch_size`. - * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored contiguously. + * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored + * contiguously. * - If `config.columns_batch` is `true`, coefficients are interleaved. * @param numerator_deg Degree of the numerator polynomial. * @param denominator Pointer to the array of coefficients of the denominator polynomial(s). 
@@ -410,8 +425,8 @@ namespace icicle { * @return eIcicleError Error code indicating success or failure. * * @note The degrees should satisfy `numerator_deg >= denominator_deg`. - * The sizes `q_size` and `r_size` must be at least `numerator_deg - denominator_deg + 1` and `denominator_deg`, respectively. - * The function assumes that the input and output arrays are properly allocated. + * The sizes `q_size` and `r_size` must be at least `numerator_deg - denominator_deg + 1` and `denominator_deg`, + * respectively. The function assumes that the input and output arrays are properly allocated. */ template eIcicleError polynomial_division( diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index 2c16ed389..c97fe3e1f 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -3,7 +3,6 @@ namespace icicle { - /*********************************** REDUCE PRODUCT ************************/ ICICLE_DISPATCHER_INST(VectorProductDispatcher, vector_product, VectorReduceOpImpl); @@ -14,24 +13,22 @@ namespace icicle { } template <> - eIcicleError - vector_product(const scalar_t* vec_a, uint64_t size, const VecOpsConfig& config, scalar_t* output) + eIcicleError vector_product(const scalar_t* vec_a, uint64_t size, const VecOpsConfig& config, scalar_t* output) { return CONCAT_EXPAND(FIELD, vector_product)(vec_a, size, &config, output); } /*********************************** REDUCE SUM ****************************/ - ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, VectorReduceOpImpl ); + ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, VectorReduceOpImpl); - extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_sum)( - const scalar_t* vec_a, uint64_t size, const VecOpsConfig* config, scalar_t* output) + extern "C" eIcicleError + CONCAT_EXPAND(FIELD, vector_sum)(const scalar_t* vec_a, uint64_t size, const VecOpsConfig* config, scalar_t* output) { return VectorSumDispatcher::execute(vec_a, size, *config, output); } template <> - eIcicleError - 
vector_sum(const scalar_t* vec_a, uint64_t size, const VecOpsConfig& config, scalar_t* output) + eIcicleError vector_sum(const scalar_t* vec_a, uint64_t size, const VecOpsConfig& config, scalar_t* output) { return CONCAT_EXPAND(FIELD, vector_sum)(vec_a, size, &config, output); } @@ -94,7 +91,8 @@ namespace icicle { } template <> - eIcicleError vector_accumulate(extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config) + eIcicleError + vector_accumulate(extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config) { return CONCAT_EXPAND(FIELD, extension_vector_accumulate)(vec_a, vec_b, size, &config); } @@ -186,14 +184,24 @@ namespace icicle { ICICLE_DISPATCHER_INST(ScalarAddDispatcher, scalar_add_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_add_vec)( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig* config, scalar_t* output) + const scalar_t* scalar_a, + const scalar_t* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig* config, + scalar_t* output) { return ScalarAddDispatcher::execute(scalar_a, vec_b, size, use_single_scalar, *config, output); } template <> eIcicleError scalar_add_vec( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, scalar_t* output) + const scalar_t* scalar_a, + const scalar_t* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig& config, + scalar_t* output) { return CONCAT_EXPAND(FIELD, scalar_add_vec)(scalar_a, vec_b, size, use_single_scalar, &config, output); } @@ -202,14 +210,24 @@ namespace icicle { ICICLE_DISPATCHER_INST(ScalarSubDispatcher, scalar_sub_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_sub_vec)( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig* config, scalar_t* output) + const 
scalar_t* scalar_a, + const scalar_t* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig* config, + scalar_t* output) { return ScalarSubDispatcher::execute(scalar_a, vec_b, size, use_single_scalar, *config, output); } template <> eIcicleError scalar_sub_vec( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, scalar_t* output) + const scalar_t* scalar_a, + const scalar_t* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig& config, + scalar_t* output) { return CONCAT_EXPAND(FIELD, scalar_sub_vec)(scalar_a, vec_b, size, use_single_scalar, &config, output); } @@ -217,14 +235,24 @@ namespace icicle { ICICLE_DISPATCHER_INST(ScalarMulDispatcher, scalar_mul_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_mul_vec)( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig* config, scalar_t* output) + const scalar_t* scalar_a, + const scalar_t* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig* config, + scalar_t* output) { return ScalarMulDispatcher::execute(scalar_a, vec_b, size, use_single_scalar, *config, output); } template <> eIcicleError scalar_mul_vec( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, scalar_t* output) + const scalar_t* scalar_a, + const scalar_t* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig& config, + scalar_t* output) { return CONCAT_EXPAND(FIELD, scalar_mul_vec)(scalar_a, vec_b, size, use_single_scalar, &config, output); } @@ -240,8 +268,8 @@ namespace icicle { } template <> - eIcicleError - convert_montgomery(const scalar_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, scalar_t* output) + eIcicleError convert_montgomery( + const scalar_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, scalar_t* 
output) { return CONCAT_EXPAND(FIELD, scalar_convert_montgomery)(input, size, is_to_montgomery, &config, output); } @@ -431,4 +459,4 @@ namespace icicle { numerator, numerator_deg, denumerator, denumerator_deg, q_size, r_size, &config, q_out, r_out); } -} // sizeamespace icicle \ No newline at end of file +} // namespace icicle \ No newline at end of file diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 5aa9dd973..50d4b0d8f 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -6,7 +6,6 @@ #include #include // For system - #include "icicle/runtime.h" #include "icicle/vec_ops.h" #include "icicle/ntt.h" @@ -31,7 +30,6 @@ static inline std::string s_main_target; static inline std::string s_reference_target; static const bool s_is_cuda_registered = is_device_registered("CUDA"); - template class FieldApiTest : public ::testing::Test { @@ -89,7 +87,6 @@ TYPED_TEST(FieldApiTest, FieldSanityTest) ASSERT_EQ(a * scalar_t::from(2), a + a); } - TYPED_TEST(FieldApiTest, vectorVectorOps) { int seed = time(0); @@ -102,7 +99,7 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) auto in_a = std::make_unique(total_size); auto in_b = std::make_unique(total_size); auto out_main = std::make_unique(total_size); - auto out_ref = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); auto vector_accumulate_wrapper = [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { @@ -127,74 +124,83 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) END_TIMER(VECADD_sync, oss.str().c_str(), measure); }; - // warmup // run(s_reference_target, out_ref.get(), false /*=measure*/, 16 /*=iters*/); // run(s_main_target, out_main.get(), false /*=measure*/, 1 /*=iters*/); - + // warmup // run(s_reference_target, out_ref.get(), false /*=measure*/, 16 /*=iters*/); // run(s_main_target, out_main.get(), false /*=measure*/, 1 /*=iters*/); - + // Element-wise vector operations - // If 
config.batch_size>1, (columns_batch=true or false) the operation is done element-wise anyway, so it doesn't affect the test + // If config.batch_size>1, (columns_batch=true or false) the operation is done element-wise anyway, so it doesn't + // affect the test // // add - FieldApiTest::random_samples(in_a.get(),total_size); - FieldApiTest::random_samples(in_b.get(),total_size); + FieldApiTest::random_samples(in_a.get(), total_size); + FieldApiTest::random_samples(in_b.get(), total_size); if (!s_is_cuda_registered) { - for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] + in_b[i]; } + for (int i = 0; i < total_size; i++) { + out_ref[i] = in_a[i] + in_b[i]; + } } else { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); - + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + // // accumulate - FieldApiTest::random_samples(in_a.get(),total_size); - FieldApiTest::random_samples(in_b.get(),total_size); + FieldApiTest::random_samples(in_a.get(), total_size); + FieldApiTest::random_samples(in_b.get(), total_size); if (!s_is_cuda_registered) { - for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] + in_b[i]; } + for (int i = 0; i < total_size; i++) { + out_ref[i] = in_a[i] + in_b[i]; + } } else { run(s_reference_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); } run(s_main_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); - ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), total_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), total_size * sizeof(TypeParam))); // // sub - FieldApiTest::random_samples(in_a.get(),total_size); - FieldApiTest::random_samples(in_b.get(),total_size); + 
FieldApiTest::random_samples(in_a.get(), total_size); + FieldApiTest::random_samples(in_b.get(), total_size); if (!s_is_cuda_registered) { - for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] - in_b[i]; } + for (int i = 0; i < total_size; i++) { + out_ref[i] = in_a[i] - in_b[i]; + } } else { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sub, "vector sub", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sub, "vector sub", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); // // mul - FieldApiTest::random_samples(in_a.get(),total_size); - FieldApiTest::random_samples(in_b.get(),total_size); + FieldApiTest::random_samples(in_a.get(), total_size); + FieldApiTest::random_samples(in_b.get(), total_size); if (!s_is_cuda_registered) { - for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] * in_b[i]; } + for (int i = 0; i < total_size; i++) { + out_ref[i] = in_a[i] * in_b[i]; + } } else { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_mul, "vector mul", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_mul, "vector mul", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); // // div - FieldApiTest::random_samples(in_a.get(),total_size); - FieldApiTest::random_samples(in_b.get(),total_size); + FieldApiTest::random_samples(in_a.get(), total_size); + FieldApiTest::random_samples(in_b.get(), total_size); // reference if (!s_is_cuda_registered) { - for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] * TypeParam::inverse(in_b[i]); } + for (int i = 0; i < total_size; i++) { + out_ref[i] = in_a[i] * TypeParam::inverse(in_b[i]); + } } else { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_div, 
"vector div", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_div, "vector div", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); - + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); } TYPED_TEST(FieldApiTest, montgomeryConversion) @@ -209,7 +215,7 @@ TYPED_TEST(FieldApiTest, montgomeryConversion) const int total_size = N * batch_size; auto in_a = std::make_unique(total_size); auto out_main = std::make_unique(total_size); - auto out_ref = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { Device dev = {dev_type, 0}; @@ -229,22 +235,29 @@ TYPED_TEST(FieldApiTest, montgomeryConversion) }; // Element-wise operation - // If config.batch_size>1, (columns_batch=true or false) the addition is done element-wise anyway, so it doesn't affect the test + // If config.batch_size>1, (columns_batch=true or false) the addition is done element-wise anyway, so it doesn't + // affect the test // convert_montgomery - FieldApiTest::random_samples(in_a.get(),total_size); + FieldApiTest::random_samples(in_a.get(), total_size); // reference if (!s_is_cuda_registered) { - if (is_to_montgomery) { for (int i = 0; i < total_size; i++) { out_ref[i] = TypeParam::to_montgomery(in_a[i]); } } - else { for (int i = 0; i < total_size; i++) { out_ref[i] = TypeParam::from_montgomery(in_a[i]); } } + if (is_to_montgomery) { + for (int i = 0; i < total_size; i++) { + out_ref[i] = TypeParam::to_montgomery(in_a[i]); + } + } else { + for (int i = 0; i < total_size; i++) { + out_ref[i] = TypeParam::from_montgomery(in_a[i]); + } + } } else { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "montgomery", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "montgomery", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * 
sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); } - TYPED_TEST(FieldApiTest, VectorReduceOps) { int seed = time(0); @@ -256,7 +269,7 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) const int total_size = N * batch_size; auto in_a = std::make_unique(total_size); auto out_main = std::make_unique(batch_size); - auto out_ref = std::make_unique(batch_size); + auto out_ref = std::make_unique(batch_size); auto vector_accumulate_wrapper = [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { @@ -280,44 +293,43 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) } END_TIMER(VECADD_sync, oss.str().c_str(), measure); }; - + // // sum - FieldApiTest::random_samples(in_a.get(),total_size); + FieldApiTest::random_samples(in_a.get(), total_size); // reference for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { - out_ref[idx_in_batch] = TypeParam::from(0); + out_ref[idx_in_batch] = TypeParam::from(0); } if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { - uint64_t idx_a = columns_batch ? idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; - out_ref[idx_in_batch] = out_ref[idx_in_batch] + in_a[idx_a]; + uint64_t idx_a = columns_batch ? 
idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_in_batch] = out_ref[idx_in_batch] + in_a[idx_a]; } } } else { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(TypeParam))); - // // product - FieldApiTest::random_samples(in_a.get(),total_size); + FieldApiTest::random_samples(in_a.get(), total_size); if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { out_ref[idx_in_batch] = TypeParam::from(1); } for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { - uint64_t idx_a = columns_batch ? idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; - out_ref[idx_in_batch] = out_ref[idx_in_batch]*in_a[idx_a]; + uint64_t idx_a = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_in_batch] = out_ref[idx_in_batch] * in_a[idx_a]; } } } else { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(TypeParam))); } TYPED_TEST(FieldApiTest, scalarVectorOps) @@ -330,10 +342,10 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) const bool columns_batch = rand() % 2; const bool use_single_scalar = rand() % 2; const int total_size = N * batch_size; - auto scalar_a = std::make_unique(use_single_scalar? 1 : batch_size); + auto scalar_a = std::make_unique(use_single_scalar ? 
1 : batch_size); auto in_b = std::make_unique(total_size); auto out_main = std::make_unique(total_size); - auto out_ref = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); auto vector_accumulate_wrapper = [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { @@ -357,35 +369,34 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) } END_TIMER(VECADD_sync, oss.str().c_str(), measure); }; - + // // scalar add vec - FieldApiTest::random_samples(scalar_a.get(),(use_single_scalar? 1: batch_size)); - FieldApiTest::random_samples(in_b.get(),total_size); - + FieldApiTest::random_samples(scalar_a.get(), (use_single_scalar ? 1 : batch_size)); + FieldApiTest::random_samples(in_b.get(), total_size); + // reference if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { - uint64_t idx_b = columns_batch ? idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; - out_ref[idx_b] = (use_single_scalar? scalar_a[0] : scalar_a[idx_in_batch]) + in_b[idx_b]; + uint64_t idx_b = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_b] = (use_single_scalar ? scalar_a[0] : scalar_a[idx_in_batch]) + in_b[idx_b]; } } } else { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); - // // scalar sub vec - FieldApiTest::random_samples(scalar_a.get(),(use_single_scalar? 1: batch_size)); - FieldApiTest::random_samples(in_b.get(),total_size); - + FieldApiTest::random_samples(scalar_a.get(), (use_single_scalar ? 
1 : batch_size)); + FieldApiTest::random_samples(in_b.get(), total_size); + if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { - uint64_t idx_b = columns_batch ? idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; - out_ref[idx_b] = (use_single_scalar? scalar_a[0] : scalar_a[idx_in_batch]) - in_b[idx_b]; + uint64_t idx_b = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_b] = (use_single_scalar ? scalar_a[0] : scalar_a[idx_in_batch]) - in_b[idx_b]; } } } else { @@ -393,24 +404,24 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); // // scalar mul vec - FieldApiTest::random_samples(scalar_a.get(),(use_single_scalar? 1: batch_size)); - FieldApiTest::random_samples(in_b.get(),total_size); - + FieldApiTest::random_samples(scalar_a.get(), (use_single_scalar ? 1 : batch_size)); + FieldApiTest::random_samples(in_b.get(), total_size); + if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { - uint64_t idx_b = columns_batch ? idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; - out_ref[idx_b] = (use_single_scalar? scalar_a[0] : scalar_a[idx_in_batch]) * in_b[idx_b]; + uint64_t idx_b = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_b] = (use_single_scalar ? 
scalar_a[0] : scalar_a[idx_in_batch]) * in_b[idx_b]; } } } else { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); } TYPED_TEST(FieldApiTest, matrixAPIsAsync) @@ -418,12 +429,15 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) int seed = time(0); srand(seed); // ICICLE_LOG_DEBUG << "seed = " << seed; - const int R = 1 << (rand() % 8 + 2); // cpu implementation for out of place trancpose also supports sizes wich are not powers of 2 - const int C = 1 << (rand() % 8 + 2); // cpu implementation for out of place trancpose also supports sizes wich are not powers of 2 + const int R = + 1 << (rand() % 8 + 2); // cpu implementation for out of place trancpose also supports sizes wich are not powers of 2 + const int C = + 1 << (rand() % 8 + 2); // cpu implementation for out of place trancpose also supports sizes wich are not powers of 2 const int batch_size = 1 << (rand() % 4); const bool columns_batch = rand() % 2; const bool is_in_place = rand() % 2; - // ICICLE_LOG_DEBUG << "R = " << R << ", C = " << C << ", batch_size = " << batch_size << ", columns_batch = " << columns_batch << ", is_in_place = " << is_in_place; //TODO SHANIE - remove this + // ICICLE_LOG_DEBUG << "R = " << R << ", C = " << C << ", batch_size = " << batch_size << ", columns_batch = " << + // columns_batch << ", is_in_place = " << is_in_place; //TODO SHANIE - remove this const int total_size = R * C * batch_size; auto h_inout = std::make_unique(total_size); auto h_out_main = std::make_unique(total_size); @@ -490,36 +504,37 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) // } // Option 3: Initialize the entire input array with random values - 
FieldApiTest::random_samples(h_inout.get(),total_size); + FieldApiTest::random_samples(h_inout.get(), total_size); // Reference implementation if (!s_is_cuda_registered) { const TypeParam* cur_mat_in = h_inout.get(); TypeParam* cur_mat_out = h_out_ref.get(); - uint32_t stride = columns_batch? batch_size : 1; + uint32_t stride = columns_batch ? batch_size : 1; const uint64_t total_elements_one_mat = static_cast(R) * C; for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { // Perform the matrix transpose for (uint32_t i = 0; i < R; ++i) { for (uint32_t j = 0; j < C; ++j) { - cur_mat_out[stride*(j * R + i)] = cur_mat_in[stride*(i * C + j)]; + cur_mat_out[stride * (j * R + i)] = cur_mat_in[stride * (i * C + j)]; } } cur_mat_in += (columns_batch ? 1 : total_elements_one_mat); cur_mat_out += (columns_batch ? 1 : total_elements_one_mat); } } else { - run(s_reference_target, (is_in_place? h_inout.get() : h_out_ref.get()), VERBOSE /*=measure*/, "transpose", ITERS); + run(s_reference_target, (is_in_place ? h_inout.get() : h_out_ref.get()), VERBOSE /*=measure*/, "transpose", ITERS); } - run(s_main_target, (is_in_place? h_inout.get() : h_out_main.get()), VERBOSE /*=measure*/, "transpose", ITERS); + run(s_main_target, (is_in_place ? 
h_inout.get() : h_out_main.get()), VERBOSE /*=measure*/, "transpose", ITERS); if (is_in_place) { ASSERT_EQ(0, memcmp(h_inout.get(), h_out_ref.get(), total_size * sizeof(TypeParam))); } else { - // std::cout << "h_out_main:\t["; for (int i = 0; i < total_size-1; i++) { std::cout << h_out_main[i] << ", "; } std::cout <::random_samples(in_a.get(),total_size); - + FieldApiTest::random_samples(in_a.get(), total_size); // Reference implementation if (!s_is_cuda_registered) { uint64_t logn = 0; uint64_t temp = N; while (temp > 1) { - temp >>= 1; - logn++; + temp >>= 1; + logn++; } - //BIT REVERSE FUNCTION + // BIT REVERSE FUNCTION for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t i = 0; i < N; i++) { int rev = 0; for (int j = 0; j < logn; ++j) { if (i & (1 << j)) { rev |= 1 << (logn - 1 - j); } } - if(columns_batch){ + if (columns_batch) { out_ref[idx_in_batch + batch_size * i] = in_a[idx_in_batch + batch_size * rev]; - // ICICLE_LOG_DEBUG << "out_ref[" << idx_in_batch + batch_size * i << "] = in_a[" << idx_in_batch + batch_size * rev << "]"; + // ICICLE_LOG_DEBUG << "out_ref[" << idx_in_batch + batch_size * i << "] = in_a[" << idx_in_batch + batch_size + // * rev << "]"; } else { out_ref[idx_in_batch * N + i] = in_a[idx_in_batch * N + rev]; // ICICLE_LOG_DEBUG << "out_ref[" << idx_in_batch * N + i << "] = in_a[" << idx_in_batch * N + rev << "]"; @@ -607,17 +621,17 @@ TYPED_TEST(FieldApiTest, bitReverse) } } } else { - run(s_reference_target, (is_in_place? in_a.get() : out_ref.get()), VERBOSE /*=measure*/, "bit-reverse", 1); + run(s_reference_target, (is_in_place ? in_a.get() : out_ref.get()), VERBOSE /*=measure*/, "bit-reverse", 1); } - run(s_main_target, (is_in_place? in_a.get() : out_main.get()), VERBOSE /*=measure*/, "bit-reverse", 1); + run(s_main_target, (is_in_place ? 
in_a.get() : out_main.get()), VERBOSE /*=measure*/, "bit-reverse", 1); if (is_in_place) { ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), N * sizeof(TypeParam))); } else { - // std::cout << "out_main:\t["; for (int i = 0; i < total_size-1; i++) { std::cout << out_main[i] << ", "; } std::cout <::random_samples(in_coeffs.get(), total_coeffs_size); FieldApiTest::random_samples(in_domain.get(), domain_size); - // Reference implementation - // TODO - Check in comperison with GPU implementation + // TODO - Check in comperison with GPU implementation run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); if (s_is_cuda_registered) { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); - // std::cout << "out_main:\t["; for (int i = 0; i < total_coeffs_size-1; i++) { std::cout << out_main[i] << ", "; } std::cout <(total_q_size); auto r_out_ref = std::make_unique(total_r_size); - auto run = [&](const std::string& dev_type, TypeParam* q_out, TypeParam* r_out, bool measure, const char* msg, int iters) { - Device dev = {dev_type, 0}; - icicle_set_device(dev); - auto config = default_vec_ops_config(); - config.batch_size = batch_size; - config.columns_batch = columns_batch; - + auto run = + [&](const std::string& dev_type, TypeParam* q_out, TypeParam* r_out, bool measure, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; - std::ostringstream oss; - oss << dev_type << " " << msg; + std::ostringstream oss; + oss << dev_type << " " << msg; - START_TIMER(polynomialDivision) - for (int i = 0; i < iters; ++i) { - ICICLE_CHECK(polynomial_division(numerator.get(), numerator_deg, denumerator.get(), denumerator_deg , q_size, r_size, config, q_out, r_out)); - } - END_TIMER(polynomialDivision, oss.str().c_str(), measure); - }; + START_TIMER(polynomialDivision) + for (int i = 0; i < iters; 
++i) { + ICICLE_CHECK(polynomial_division( + numerator.get(), numerator_deg, denumerator.get(), denumerator_deg, q_size, r_size, config, q_out, r_out)); + } + END_TIMER(polynomialDivision, oss.str().c_str(), measure); + }; // // Option 1: Initialize input vectors with random values // FieldApiTest::random_samples(numerator.get(), total_numerator_size); // FieldApiTest::random_samples(denumerator.get(), total_denumerator_size); - // // Reference implementation + // // Reference implementation // TODO - Check in comperison with GPU implementation or implement a general reference implementation // Option 2: Initialize the numerator and denumerator with chosen example // And the reference implementation for the example for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { - if (columns_batch){ + if (columns_batch) { // numerator = 3x^3+4x^2+5 - numerator[idx_in_batch + 0*batch_size] = TypeParam::from(5); - numerator[idx_in_batch + 1*batch_size] = TypeParam::from(0); - numerator[idx_in_batch + 2*batch_size] = TypeParam::from(4); - numerator[idx_in_batch + 3*batch_size] = TypeParam::from(3); + numerator[idx_in_batch + 0 * batch_size] = TypeParam::from(5); + numerator[idx_in_batch + 1 * batch_size] = TypeParam::from(0); + numerator[idx_in_batch + 2 * batch_size] = TypeParam::from(4); + numerator[idx_in_batch + 3 * batch_size] = TypeParam::from(3); // denumerator = x^2-1 - denumerator[idx_in_batch + 0*batch_size] = TypeParam::from(0) - TypeParam::from(1); - denumerator[idx_in_batch + 1*batch_size] = TypeParam::from(0); - denumerator[idx_in_batch + 2*batch_size] = TypeParam::from(1); + denumerator[idx_in_batch + 0 * batch_size] = TypeParam::from(0) - TypeParam::from(1); + denumerator[idx_in_batch + 1 * batch_size] = TypeParam::from(0); + denumerator[idx_in_batch + 2 * batch_size] = TypeParam::from(1); if (!s_is_cuda_registered) { // q_out_ref = 3x+4 - q_out_ref[idx_in_batch + 0*batch_size] = TypeParam::from(4); - q_out_ref[idx_in_batch + 1*batch_size] 
= TypeParam::from(3); + q_out_ref[idx_in_batch + 0 * batch_size] = TypeParam::from(4); + q_out_ref[idx_in_batch + 1 * batch_size] = TypeParam::from(3); // r_out_ref = 3x+9 - r_out_ref[idx_in_batch + 0*batch_size] = TypeParam::from(9); - r_out_ref[idx_in_batch + 1*batch_size] = TypeParam::from(3); + r_out_ref[idx_in_batch + 0 * batch_size] = TypeParam::from(9); + r_out_ref[idx_in_batch + 1 * batch_size] = TypeParam::from(3); } } else { // numerator = 3x^3+4x^2+5 - numerator[idx_in_batch * (numerator_deg+1) + 0] = TypeParam::from(5); - numerator[idx_in_batch * (numerator_deg+1) + 1] = TypeParam::from(0); - numerator[idx_in_batch * (numerator_deg+1) + 2] = TypeParam::from(4); - numerator[idx_in_batch * (numerator_deg+1) + 3] = TypeParam::from(3); + numerator[idx_in_batch * (numerator_deg + 1) + 0] = TypeParam::from(5); + numerator[idx_in_batch * (numerator_deg + 1) + 1] = TypeParam::from(0); + numerator[idx_in_batch * (numerator_deg + 1) + 2] = TypeParam::from(4); + numerator[idx_in_batch * (numerator_deg + 1) + 3] = TypeParam::from(3); // denumerator = x^2-1 - denumerator[idx_in_batch * (denumerator_deg+1) + 0] = TypeParam::from(0) - TypeParam::from(1); - denumerator[idx_in_batch * (denumerator_deg+1) + 1] = TypeParam::from(0); - denumerator[idx_in_batch * (denumerator_deg+1) + 2] = TypeParam::from(1); + denumerator[idx_in_batch * (denumerator_deg + 1) + 0] = TypeParam::from(0) - TypeParam::from(1); + denumerator[idx_in_batch * (denumerator_deg + 1) + 1] = TypeParam::from(0); + denumerator[idx_in_batch * (denumerator_deg + 1) + 2] = TypeParam::from(1); if (!s_is_cuda_registered) { // q_out_ref = 3x+4 q_out_ref[idx_in_batch * q_size + 0] = TypeParam::from(4); @@ -903,10 +918,13 @@ TYPED_TEST(FieldApiTest, polynomialDivision) if (s_is_cuda_registered) { run(s_reference_target, q_out_ref.get(), r_out_ref.get(), VERBOSE /*=measure*/, "polynomial_division", 1); } - // std::cout << "numerator:\t["; for (int i = 0; i < total_numerator_size-1; i++) { std::cout << 
numerator[i] << ", "; } std::cout < Date: Mon, 21 Oct 2024 14:33:56 +0300 Subject: [PATCH 11/43] vectorVectorOps passes --- icicle/tests/test_field_api.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 50d4b0d8f..a717faf33 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -28,7 +28,8 @@ static bool VERBOSE = true; static int ITERS = 1; static inline std::string s_main_target; static inline std::string s_reference_target; -static const bool s_is_cuda_registered = is_device_registered("CUDA"); +// static const bool s_is_cuda_registered = is_device_registered("CUDA"); +bool s_is_cuda_registered; template class FieldApiTest : public ::testing::Test @@ -42,6 +43,7 @@ class FieldApiTest : public ::testing::Test #endif icicle_load_backend_from_env_or_default(); + s_is_cuda_registered = is_device_registered("CUDA"); if (!s_is_cuda_registered) { ICICLE_LOG_ERROR << "CUDA device not found. Testing CPU vs reference (on cpu)"; } s_main_target = s_is_cuda_registered ? 
"CUDA" : "CPU"; s_reference_target = "CPU"; @@ -93,13 +95,18 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) srand(seed); ICICLE_LOG_DEBUG << "seed = " << seed; const uint64_t N = 1 << (rand() % 15 + 3); + // const uint64_t N = 1 << (3); const int batch_size = 1 << (rand() % 5); + // const int batch_size = 2; const bool columns_batch = rand() % 2; const int total_size = N * batch_size; auto in_a = std::make_unique(total_size); auto in_b = std::make_unique(total_size); auto out_main = std::make_unique(total_size); auto out_ref = std::make_unique(total_size); + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; auto vector_accumulate_wrapper = [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { @@ -152,14 +159,19 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) // // accumulate FieldApiTest::random_samples(in_a.get(), total_size); FieldApiTest::random_samples(in_b.get(), total_size); - if (!s_is_cuda_registered) { + // if (!s_is_cuda_registered) { for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] + in_b[i]; } - } else { - run(s_reference_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); - } + // } else { + // run(s_reference_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); + // } run(s_main_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); + + // for (int i = 0; i < total_size; i++) { + // ICICLE_LOG_DEBUG << i << ", " << in_a[i] << ", " << in_b[i] << ", " << out_ref[i]; + // } + ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), total_size * sizeof(TypeParam))); // // sub From 0c6bc9aaffde999272f2306562bfafe8e2f0bef3 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Tue, 22 Oct 2024 15:14:43 +0300 Subject: [PATCH 12/43] mont + scalars passing --- icicle/tests/test_field_api.cpp | 19 
+++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index a717faf33..8f607e4d4 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -348,16 +348,24 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) { int seed = time(0); srand(seed); - // ICICLE_LOG_DEBUG << "seed = " << seed; + ICICLE_LOG_DEBUG << "seed = " << seed; const uint64_t N = 1 << (rand() % 15 + 3); + // const uint64_t N = 1 << 3; const int batch_size = 1 << (rand() % 5); + // const int batch_size = 2; const bool columns_batch = rand() % 2; + // const bool columns_batch = 0; const bool use_single_scalar = rand() % 2; + // const bool use_single_scalar = 1; const int total_size = N * batch_size; auto scalar_a = std::make_unique(use_single_scalar ? 1 : batch_size); auto in_b = std::make_unique(total_size); auto out_main = std::make_unique(total_size); auto out_ref = std::make_unique(total_size); + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + ICICLE_LOG_DEBUG << "use_single_scalar = " << use_single_scalar; auto vector_accumulate_wrapper = [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { @@ -398,9 +406,16 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); + + + // ICICLE_LOG_DEBUG << scalar_a[0] << ", "; + // for (int i = 0; i < total_size; i++) { + // ICICLE_LOG_DEBUG << i << ", " << in_b[i] << ", " << out_main[i] << ", " << out_ref[i]; + // } + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); - // // scalar sub vec + // scalar sub vec FieldApiTest::random_samples(scalar_a.get(), (use_single_scalar ? 
1 : batch_size)); FieldApiTest::random_samples(in_b.get(), total_size); From 32e262b2f34c9870692674b7fee0c8abc9c18984 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Wed, 23 Oct 2024 18:20:46 +0300 Subject: [PATCH 13/43] bitrev passes --- icicle/tests/test_field_api.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 8f607e4d4..96f09ed82 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -569,17 +569,17 @@ TYPED_TEST(FieldApiTest, bitReverse) { int seed = time(0); srand(seed); - // ICICLE_LOG_DEBUG << "seed = " << seed; + ICICLE_LOG_DEBUG << "seed = " << seed; const uint64_t N = 1 << (rand() % 15 + 3); const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; const bool is_in_place = rand() % 2; const int total_size = N * batch_size; - // const uint64_t N = 1 << (2); + // const uint64_t N = 1 << (3); // const int batch_size = 1 << (1); - // const bool columns_batch = true; - // const bool is_in_place = true; + // const bool columns_batch = 1; + // const bool is_in_place = 0; // const int total_size = N * batch_size; auto in_a = std::make_unique(total_size); @@ -623,7 +623,7 @@ TYPED_TEST(FieldApiTest, bitReverse) FieldApiTest::random_samples(in_a.get(), total_size); // Reference implementation - if (!s_is_cuda_registered) { + if (!s_is_cuda_registered || is_in_place) { uint64_t logn = 0; uint64_t temp = N; while (temp > 1) { @@ -652,6 +652,10 @@ TYPED_TEST(FieldApiTest, bitReverse) } run(s_main_target, (is_in_place ? 
in_a.get() : out_main.get()), VERBOSE /*=measure*/, "bit-reverse", 1); + // for (int i = 0; i < total_size; i++) { + // ICICLE_LOG_DEBUG << i << ", " << in_a[i] << ", " << out_main[i] << ", " << out_ref[i]; + // } + if (is_in_place) { ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), N * sizeof(TypeParam))); } else { From e8e1799f63f5f978a913e533a62ce11ecbc981c2 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Mon, 28 Oct 2024 13:20:43 +0200 Subject: [PATCH 14/43] slice passes --- icicle/tests/test_field_api.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 96f09ed82..bacb44540 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -670,13 +670,25 @@ TYPED_TEST(FieldApiTest, Slice) { int seed = time(0); srand(seed); - // ICICLE_LOG_DEBUG << "seed = " << seed; + ICICLE_LOG_DEBUG << "seed = " << seed; const uint64_t size_in = 1 << (rand() % 15 + 5); const uint64_t offset = rand() % 15; const uint64_t stride = rand() % 4 + 1; const uint64_t size_out = rand() % (((size_in - offset) / stride) - 1) + 1; const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; + + // const uint64_t size_in = 1 << (20); + // const uint64_t offset = 97; + // const uint64_t stride = 6; + // const uint64_t size_out = (((size_in - offset) / stride) - 1) - 100; + + // ICICLE_LOG_DEBUG << size_in <<", "<< offset<<", "< Date: Tue, 29 Oct 2024 12:24:07 +0200 Subject: [PATCH 15/43] slice passes --- icicle/tests/test_field_api.cpp | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index bacb44540..cc2927b7f 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -462,7 +462,14 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) 1 << (rand() % 8 + 2); // cpu implementation for out of place trancpose also 
supports sizes wich are not powers of 2 const int batch_size = 1 << (rand() % 4); const bool columns_batch = rand() % 2; - const bool is_in_place = rand() % 2; + const bool is_in_place = s_is_cuda_registered? 0 : rand() % 2; //TODO - fix inplace (Hadar: I'm not sure we should support it) + + // const int R = 4; // cpu implementation for out of place trancpose also supports sizes wich are not powers of 2 + // const int C = 3; + // const int batch_size = 1 << (1); + // const bool columns_batch = 1; + // const bool is_in_place = 1; + // ICICLE_LOG_DEBUG << "R = " << R << ", C = " << C << ", batch_size = " << batch_size << ", columns_batch = " << // columns_batch << ", is_in_place = " << is_in_place; //TODO SHANIE - remove this const int total_size = R * C * batch_size; @@ -488,9 +495,9 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) TypeParam *d_in, *d_out; if (!device_props.using_host_memory) { icicle_create_stream(&config.stream); - icicle_malloc_async((void**)&d_in, R * C * sizeof(TypeParam), config.stream); - icicle_malloc_async((void**)&d_out, R * C * sizeof(TypeParam), config.stream); - icicle_copy_to_device_async(d_in, h_inout.get(), R * C * sizeof(TypeParam), config.stream); + icicle_malloc_async((void**)&d_in, total_size * sizeof(TypeParam), config.stream); + icicle_malloc_async((void**)&d_out, total_size * sizeof(TypeParam), config.stream); + icicle_copy_to_device_async(d_in, h_inout.get(), total_size * sizeof(TypeParam), config.stream); config.is_a_on_device = true; config.is_result_on_device = true; @@ -507,7 +514,7 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) END_TIMER(TRANSPOSE, oss.str().c_str(), measure); if (!device_props.using_host_memory) { - icicle_copy_to_host_async(h_out, d_out, R * C * sizeof(TypeParam), config.stream); + icicle_copy_to_host_async(h_out, d_out, total_size * sizeof(TypeParam), config.stream); icicle_stream_synchronize(config.stream); icicle_free_async(d_in, config.stream); icicle_free_async(d_out, config.stream); @@ -554,6 +561,12 
@@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) } run(s_main_target, (is_in_place ? h_inout.get() : h_out_main.get()), VERBOSE /*=measure*/, "transpose", ITERS); + + // ICICLE_LOG_DEBUG << scalar_a[0] << ", "; + // for (int i = 0; i < total_size; i++) { + // ICICLE_LOG_DEBUG << i << ", " << h_inout[i] << ", " << h_out_main[i] << ", " << h_out_ref[i]; + // } + if (is_in_place) { ASSERT_EQ(0, memcmp(h_inout.get(), h_out_ref.get(), total_size * sizeof(TypeParam))); } else { From 0c609bf10e192d50e39637cbae165b22ae441072 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Tue, 29 Oct 2024 15:48:02 +0200 Subject: [PATCH 16/43] reduction passes --- icicle/tests/test_field_api.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index cc2927b7f..a1bffc7b8 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -279,6 +279,12 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; const int total_size = N * batch_size; + + // const uint64_t N = 1 << (rand() % 15 + 3); + // const int batch_size = 1 << 3; + // const bool columns_batch = 1; + // const int total_size = N * batch_size; + auto in_a = std::make_unique(total_size); auto out_main = std::make_unique(batch_size); auto out_ref = std::make_unique(batch_size); From dca2e5bf21b3e95a579f46a2acf58b730179409b Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Wed, 30 Oct 2024 10:28:18 +0200 Subject: [PATCH 17/43] fix scalar columns batch --- icicle/tests/test_field_api.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index a1bffc7b8..00985a25d 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -274,14 +274,14 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) { int seed = time(0); srand(seed); - // ICICLE_LOG_DEBUG << "seed 
= " << seed; + ICICLE_LOG_DEBUG << "seed = " << seed; const uint64_t N = 1 << (rand() % 15 + 3); const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; const int total_size = N * batch_size; - // const uint64_t N = 1 << (rand() % 15 + 3); - // const int batch_size = 1 << 3; + // const uint64_t N = 1 << (20); + // const int batch_size = 1 << 4; // const bool columns_batch = 1; // const int total_size = N * batch_size; @@ -356,13 +356,15 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) srand(seed); ICICLE_LOG_DEBUG << "seed = " << seed; const uint64_t N = 1 << (rand() % 15 + 3); - // const uint64_t N = 1 << 3; const int batch_size = 1 << (rand() % 5); - // const int batch_size = 2; const bool columns_batch = rand() % 2; - // const bool columns_batch = 0; const bool use_single_scalar = rand() % 2; - // const bool use_single_scalar = 1; + + // const uint64_t N = 1 << (4); + // const int batch_size = 7; + // const bool columns_batch = 1; + // const bool use_single_scalar = 0; + const int total_size = N * batch_size; auto scalar_a = std::make_unique(use_single_scalar ? 
1 : batch_size); auto in_b = std::make_unique(total_size); @@ -415,6 +417,7 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) // ICICLE_LOG_DEBUG << scalar_a[0] << ", "; + // ICICLE_LOG_DEBUG << scalar_a[1] << ", "; // for (int i = 0; i < total_size; i++) { // ICICLE_LOG_DEBUG << i << ", " << in_b[i] << ", " << out_main[i] << ", " << out_ref[i]; // } From 0728a069352f4bd8eca8d5a00bb6554a17dd54ba Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Wed, 30 Oct 2024 12:20:25 +0200 Subject: [PATCH 18/43] remove same scalar bool --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 20 ++++------ .../include/icicle/backend/vec_ops_backend.h | 1 - .../default_backend/default_poly_backend.h | 8 ++-- icicle/include/icicle/vec_ops.h | 21 ++-------- icicle/src/vec_ops.cpp | 18 +++------ icicle/tests/test_field_api.cpp | 39 ++++++++++--------- 6 files changed, 41 insertions(+), 66 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 74678fc83..7133bec8c 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -405,22 +405,21 @@ eIcicleError cpu_scalar_vector_op( const T* scalar_a, const T* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig& config, T* output) { TasksManager> task_manager(get_nof_workers(config) - 1); - const uint64_t total_nof_operations = use_single_scalar ? size * config.batch_size : size; - const uint32_t stride = (!use_single_scalar && config.columns_batch) ? config.batch_size : 1; - for (uint32_t idx_in_batch = 0; idx_in_batch < (use_single_scalar ? 1 : config.batch_size); idx_in_batch++) { + const uint64_t total_nof_operations = size; + const uint32_t stride = config.columns_batch ? 
config.batch_size : 1; + for (uint32_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_2ops_task( op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), scalar_a + idx_in_batch, - (!use_single_scalar && config.columns_batch) ? vec_b + idx_in_batch + i * config.batch_size + config.columns_batch ? vec_b + idx_in_batch + i * config.batch_size : vec_b + idx_in_batch * size + i, stride, - (!use_single_scalar && config.columns_batch) ? output + idx_in_batch + i * config.batch_size + config.columns_batch ? output + idx_in_batch + i * config.batch_size : output + idx_in_batch * size + i); } } @@ -595,11 +594,10 @@ eIcicleError cpu_scalar_add( const T* scalar_a, const T* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig& config, T* output) { - return cpu_scalar_vector_op(VecOperation::SCALAR_ADD_VEC, scalar_a, vec_b, size, use_single_scalar, config, output); + return cpu_scalar_vector_op(VecOperation::SCALAR_ADD_VEC, scalar_a, vec_b, size, config, output); } REGISTER_SCALAR_ADD_VEC_BACKEND("CPU", cpu_scalar_add); @@ -611,11 +609,10 @@ eIcicleError cpu_scalar_sub( const T* scalar_a, const T* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig& config, T* output) { - return cpu_scalar_vector_op(VecOperation::SCALAR_SUB_VEC, scalar_a, vec_b, size, use_single_scalar, config, output); + return cpu_scalar_vector_op(VecOperation::SCALAR_SUB_VEC, scalar_a, vec_b, size, config, output); } REGISTER_SCALAR_SUB_VEC_BACKEND("CPU", cpu_scalar_sub); @@ -627,11 +624,10 @@ eIcicleError cpu_scalar_mul( const T* scalar_a, const T* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig& config, T* output) { - return cpu_scalar_vector_op(VecOperation::SCALAR_MUL_VEC, scalar_a, vec_b, size, use_single_scalar, config, output); + return 
cpu_scalar_vector_op(VecOperation::SCALAR_MUL_VEC, scalar_a, vec_b, size, config, output); } REGISTER_SCALAR_MUL_VEC_BACKEND("CPU", cpu_scalar_mul); diff --git a/icicle/include/icicle/backend/vec_ops_backend.h b/icicle/include/icicle/backend/vec_ops_backend.h index 58909e1f4..1adfe89f8 100644 --- a/icicle/include/icicle/backend/vec_ops_backend.h +++ b/icicle/include/icicle/backend/vec_ops_backend.h @@ -34,7 +34,6 @@ namespace icicle { const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig& config, scalar_t* output)>; diff --git a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h index c7e53b218..12468cb53 100644 --- a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h +++ b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h @@ -126,7 +126,7 @@ namespace icicle { C zero = C::zero(); config.is_a_on_device = false; ICICLE_CHECK( - scalar_sub_vec(&zero, b_mem_p + min_op_size, b->get_nof_elements() - min_op_size, true, config, res_mem_p)); + scalar_sub_vec(&zero, b_mem_p + min_op_size, b->get_nof_elements() - min_op_size, config, res_mem_p)); } } @@ -173,7 +173,7 @@ namespace icicle { config.is_result_on_device = true; config.is_async = true; config.stream = m_stream; - icicle::scalar_mul_vec(&scalar, p_elements_p, N, true, config, out_evals_p); + icicle::scalar_mul_vec(&scalar, p_elements_p, N, config, out_evals_p); } void multiply_with_padding(PolyContext c, PolyContext a, PolyContext b) @@ -409,7 +409,7 @@ namespace icicle { config.is_async = true; config.stream = m_stream; icicle::scalar_mul_vec( - &v_coset_eval, numerator_evals_reversed_p + N /*second half is the reversed coset*/, N, true, config, + &v_coset_eval, numerator_evals_reversed_p + N /*second half is the reversed coset*/, N, config, out_evals_reversed_p); // INTT back from reversed evals on coset to coeffs @@ 
-450,7 +450,7 @@ namespace icicle { config.is_result_on_device = true; config.is_async = true; config.stream = m_stream; - icicle::scalar_mul_vec(&v_coset_eval, out_evals_reversed_p, N, true, config, out_evals_reversed_p); + icicle::scalar_mul_vec(&v_coset_eval, out_evals_reversed_p, N, config, out_evals_reversed_p); // (3) INTT back from coset to coeffs ntt_config.are_inputs_on_device = true; diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index 524cbcdc5..132d6cb69 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -220,17 +220,12 @@ namespace icicle { * * @tparam T Type of the elements in the vector and the scalar. * @param scalar_a Pointer to the input scalar(s). - * - If `use_single_scalar` is `true`, this should point to a single scalar value. - * - If `use_single_scalar` is `false`, this should point to an array of scalars with length - * `config.batch_size`. * @param vec_b Pointer to the input vector(s). * - If `config.batch_size > 1`, this should be a concatenated array of vectors. * - The layout depends on `config.columns_batch`: * - If `false`, vectors are stored contiguously. * - If `true`, vectors are stored as columns in a 2D array. * @param size Number of elements in a vector. - * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of - * scalars (`false`). * @param config Configuration for the operation. * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. 
@@ -238,24 +233,19 @@ namespace icicle { */ template eIcicleError scalar_add_vec( - const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); + const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); /** * @brief Subtracts each element of a vector from a scalar, elementwise (res[i]=scalar-vec[i]). * * @tparam T Type of the elements in the vector and the scalar. * @param scalar_a Pointer to Input scalar(s). - * - If `use_single_scalar` is `true`, this should point to a single scalar value. - * - If `use_single_scalar` is `false`, this should point to an array of scalars with length - * `config.batch_size`. * @param vec_b Pointer to the input vector(s). * - If `config.batch_size > 1`, this should be a concatenated array of vectors. * - The layout depends on `config.columns_batch`: * - If `false`, vectors are stored contiguously. * - If `true`, vectors are stored as columns in a 2D array. * @param size Number of elements in a vector. - * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of - * scalars (`false`). * @param config Configuration for the operation. * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. @@ -263,31 +253,26 @@ namespace icicle { */ template eIcicleError scalar_sub_vec( - const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); + const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); /** * @brief Multiplies each element of a vector by a scalar. * * @tparam T Type of the elements in the vector and the scalar. * @param scalar_a Pointer to Input scalar(s). - * - If `use_single_scalar` is `true`, this should point to a single scalar value. 
- * - If `use_single_scalar` is `false`, this should point to an array of scalars with length - * `config.batch_size`. * @param vec_b Pointer to the input vector(s). * - If `config.batch_size > 1`, this should be a concatenated array of vectors. * - The layout depends on `config.columns_batch`: * - If `false`, vectors are stored contiguously. * - If `true`, vectors are stored as columns in a 2D array. * @param size Number of elements in a vector. - * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of - * scalars (`false`). * @param config Configuration for the operation. * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. */ template eIcicleError scalar_mul_vec( - const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); + const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); // Matrix operations diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index c97fe3e1f..5eb4ea49e 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -187,11 +187,10 @@ namespace icicle { const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig* config, scalar_t* output) { - return ScalarAddDispatcher::execute(scalar_a, vec_b, size, use_single_scalar, *config, output); + return ScalarAddDispatcher::execute(scalar_a, vec_b, size, *config, output); } template <> @@ -199,11 +198,10 @@ namespace icicle { const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, scalar_add_vec)(scalar_a, vec_b, size, use_single_scalar, &config, output); + return CONCAT_EXPAND(FIELD, scalar_add_vec)(scalar_a, vec_b, size, &config, output); } /*********************************** (Scalar - 
Vector) ELEMENT WISE ***********************************/ @@ -213,11 +211,10 @@ namespace icicle { const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig* config, scalar_t* output) { - return ScalarSubDispatcher::execute(scalar_a, vec_b, size, use_single_scalar, *config, output); + return ScalarSubDispatcher::execute(scalar_a, vec_b, size, *config, output); } template <> @@ -225,11 +222,10 @@ namespace icicle { const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, scalar_sub_vec)(scalar_a, vec_b, size, use_single_scalar, &config, output); + return CONCAT_EXPAND(FIELD, scalar_sub_vec)(scalar_a, vec_b, size, &config, output); } /*********************************** MUL BY SCALAR ***********************************/ ICICLE_DISPATCHER_INST(ScalarMulDispatcher, scalar_mul_vec, scalarVectorOpImpl); @@ -238,11 +234,10 @@ namespace icicle { const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig* config, scalar_t* output) { - return ScalarMulDispatcher::execute(scalar_a, vec_b, size, use_single_scalar, *config, output); + return ScalarMulDispatcher::execute(scalar_a, vec_b, size, *config, output); } template <> @@ -250,11 +245,10 @@ namespace icicle { const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, scalar_mul_vec)(scalar_a, vec_b, size, use_single_scalar, &config, output); + return CONCAT_EXPAND(FIELD, scalar_mul_vec)(scalar_a, vec_b, size, &config, output); } /*********************************** CONVERT MONTGOMERY ***********************************/ diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 00985a25d..4c079a80c 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ 
-358,22 +358,19 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) const uint64_t N = 1 << (rand() % 15 + 3); const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; - const bool use_single_scalar = rand() % 2; // const uint64_t N = 1 << (4); // const int batch_size = 7; // const bool columns_batch = 1; - // const bool use_single_scalar = 0; const int total_size = N * batch_size; - auto scalar_a = std::make_unique(use_single_scalar ? 1 : batch_size); + auto scalar_a = std::make_unique(batch_size); auto in_b = std::make_unique(total_size); auto out_main = std::make_unique(total_size); auto out_ref = std::make_unique(total_size); ICICLE_LOG_DEBUG << "N = " << N; ICICLE_LOG_DEBUG << "batch_size = " << batch_size; ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; - ICICLE_LOG_DEBUG << "use_single_scalar = " << use_single_scalar; auto vector_accumulate_wrapper = [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { @@ -393,13 +390,13 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) START_TIMER(VECADD_sync) for (int i = 0; i < iters; ++i) { - ICICLE_CHECK(vec_op_func(scalar_a.get(), in_b.get(), N, use_single_scalar, config, out)); + ICICLE_CHECK(vec_op_func(scalar_a.get(), in_b.get(), N, config, out)); } END_TIMER(VECADD_sync, oss.str().c_str(), measure); }; // // scalar add vec - FieldApiTest::random_samples(scalar_a.get(), (use_single_scalar ? 1 : batch_size)); + FieldApiTest::random_samples(scalar_a.get(), batch_size); FieldApiTest::random_samples(in_b.get(), total_size); // reference @@ -407,7 +404,7 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { uint64_t idx_b = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; - out_ref[idx_b] = (use_single_scalar ? 
scalar_a[0] : scalar_a[idx_in_batch]) + in_b[idx_b]; + out_ref[idx_b] = (scalar_a[idx_in_batch]) + in_b[idx_b]; } } } else { @@ -425,14 +422,14 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); // scalar sub vec - FieldApiTest::random_samples(scalar_a.get(), (use_single_scalar ? 1 : batch_size)); + FieldApiTest::random_samples(scalar_a.get(), batch_size); FieldApiTest::random_samples(in_b.get(), total_size); if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { uint64_t idx_b = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; - out_ref[idx_b] = (use_single_scalar ? scalar_a[0] : scalar_a[idx_in_batch]) - in_b[idx_b]; + out_ref[idx_b] = (scalar_a[idx_in_batch]) - in_b[idx_b]; } } } else { @@ -443,14 +440,14 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); // // scalar mul vec - FieldApiTest::random_samples(scalar_a.get(), (use_single_scalar ? 1 : batch_size)); + FieldApiTest::random_samples(scalar_a.get(), batch_size); FieldApiTest::random_samples(in_b.get(), total_size); if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { uint64_t idx_b = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; - out_ref[idx_b] = (use_single_scalar ? 
scalar_a[0] : scalar_a[idx_in_batch]) * in_b[idx_b]; + out_ref[idx_b] = (scalar_a[idx_in_batch]) * in_b[idx_b]; } } } else { @@ -788,12 +785,12 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) int seed = time(0); srand(seed); // ICICLE_LOG_DEBUG << "seed = " << seed; - const uint64_t N = 1 << (rand() % 15 + 3); - const int batch_size = 1 << (rand() % 5); - const bool columns_batch = rand() % 2; - // const uint64_t N = 1 << (3); - // const int batch_size = 1 << (1); - // const bool columns_batch = true; + // const uint64_t N = 1 << (rand() % 15 + 3); + // const int batch_size = 1 << (rand() % 5); + // const bool columns_batch = rand() % 2; + const uint64_t N = 1 << (8); + const int batch_size = 1 << (3); + const bool columns_batch = 1; const int total_size = N * batch_size; auto in_a = std::make_unique(total_size); @@ -819,7 +816,7 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) // Initialize each entire vector with 1 at a random index. The highest non-zero index is the index with 1 for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { - if (!s_is_cuda_registered) { out_ref[idx_in_batch] = rand() % N; } // highest_non_zero_idx + if (!s_is_cuda_registered) { out_ref[idx_in_batch] = static_cast(rand() % N); } // highest_non_zero_idx for (uint32_t i = 0; i < N; i++) { if (columns_batch) { in_a[idx_in_batch + batch_size * i] = TypeParam::from(i == out_ref[idx_in_batch] ? 
1 : 0); @@ -833,7 +830,11 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) // std::cout << "out_main:\t["; for (int i = 0; i < batch_size-1; i++) { std::cout << out_main[i] << ", "; } std::cout // < Date: Wed, 30 Oct 2024 13:24:36 +0200 Subject: [PATCH 19/43] fix API --- icicle/include/icicle/vec_ops.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index 132d6cb69..291462d6b 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -417,8 +417,10 @@ namespace icicle { eIcicleError polynomial_division( const T* numerator, int64_t numerator_deg, + uint64_t numerator_size, const T* denumerator, int64_t denumerator_deg, + uint64_t denumerator_size, uint64_t q_size, uint64_t r_size, const VecOpsConfig& config, From 2fd1facf999e814ce2a043120ac498e1e66b081b Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Wed, 30 Oct 2024 13:40:19 +0200 Subject: [PATCH 20/43] fix API --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 2 ++ icicle/src/vec_ops.cpp | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 7133bec8c..f27ab5600 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -910,8 +910,10 @@ eIcicleError cpu_poly_divide( const Device& device, const T* numerator, int64_t numerator_deg, + uint64_t numerator_size, const T* denumerator, int64_t denumerator_deg, + uint64_t denumerator_size, uint64_t q_size, uint64_t r_size, const VecOpsConfig& config, diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index 5eb4ea49e..c722c595a 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -425,8 +425,10 @@ namespace icicle { extern "C" eIcicleError CONCAT_EXPAND(FIELD, poly_division)( const scalar_t* numerator, int64_t numerator_deg, + uint64_t numerator_size, const scalar_t* denumerator, 
int64_t denumerator_deg, + uint64_t denumerator_size, uint64_t q_size, uint64_t r_size, const VecOpsConfig* config, @@ -434,15 +436,17 @@ namespace icicle { scalar_t* r_out /*OUT*/) { return ScalarPolyDivDispatcher::execute( - numerator, numerator_deg, denumerator, denumerator_deg, q_size, r_size, *config, q_out, r_out); + numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, *config, q_out, r_out); } template <> eIcicleError polynomial_division( const scalar_t* numerator, int64_t numerator_deg, + uint64_t numerator_size, const scalar_t* denumerator, int64_t denumerator_deg, + uint64_t denumerator_size, uint64_t q_size, uint64_t r_size, const VecOpsConfig& config, @@ -450,7 +454,7 @@ namespace icicle { scalar_t* r_out /*OUT*/) { return CONCAT_EXPAND(FIELD, poly_division)( - numerator, numerator_deg, denumerator, denumerator_deg, q_size, r_size, &config, q_out, r_out); + numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, &config, q_out, r_out); } } // namespace icicle \ No newline at end of file From 1bd7c0501867c1f49a0c55753fe46199e3b758c8 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Wed, 30 Oct 2024 23:52:31 +0200 Subject: [PATCH 21/43] non zero passes --- .../include/icicle/backend/vec_ops_backend.h | 2 ++ .../default_backend/default_poly_backend.h | 2 +- icicle/include/icicle/vec_ops.h | 10 ++++++++ icicle/src/vec_ops.cpp | 6 ++--- icicle/tests/test_field_api.cpp | 24 +++++++++---------- 5 files changed, 28 insertions(+), 16 deletions(-) diff --git a/icicle/include/icicle/backend/vec_ops_backend.h b/icicle/include/icicle/backend/vec_ops_backend.h index 1adfe89f8..04f7ed73f 100644 --- a/icicle/include/icicle/backend/vec_ops_backend.h +++ b/icicle/include/icicle/backend/vec_ops_backend.h @@ -74,8 +74,10 @@ namespace icicle { const Device& device, const scalar_t* numerator, int64_t numerator_deg, + uint64_t numerator_size, const scalar_t* denumerator, 
int64_t denumerator_deg, + uint64_t denumerator_size, uint64_t q_size, uint64_t r_size, const VecOpsConfig& config, diff --git a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h index 12468cb53..a42c87317 100644 --- a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h +++ b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h @@ -278,7 +278,7 @@ namespace icicle { config.is_result_on_device = true; ICICLE_CHECK(icicle::polynomial_division( - a_coeffs, deg_a, b_coeffs, deg_b, deg_a - deg_b + 1, a_N, config, Q_coeffs, R_coeffs)); + a_coeffs, deg_a, a_N, b_coeffs, deg_b, b_N, deg_a - deg_b + 1, a_N, config, Q_coeffs, R_coeffs)); } void quotient(PolyContext Q, PolyContext op_a, PolyContext op_b) override diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index 291462d6b..bf5eab324 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -342,6 +342,16 @@ namespace icicle { const VecOpsConfig& config, T* vec_out); + // Deprecated slice API + template + eIcicleError slice( + const T* vec_in, + uint64_t offset, + uint64_t stride, + uint64_t size, + const VecOpsConfig& config, + T* vec_out); + /** * @brief Finds the highest non-zero index in a vector or batch of vectors. 
* diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index c722c595a..4606a1f8c 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -431,12 +431,12 @@ namespace icicle { uint64_t denumerator_size, uint64_t q_size, uint64_t r_size, - const VecOpsConfig* config, + const VecOpsConfig& config, scalar_t* q_out /*OUT*/, scalar_t* r_out /*OUT*/) { return ScalarPolyDivDispatcher::execute( - numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, *config, q_out, r_out); + numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, config, q_out, r_out); } template <> @@ -454,7 +454,7 @@ namespace icicle { scalar_t* r_out /*OUT*/) { return CONCAT_EXPAND(FIELD, poly_division)( - numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, &config, q_out, r_out); + numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, config, q_out, r_out); } } // namespace icicle \ No newline at end of file diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 4c079a80c..95d673f8a 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -784,13 +784,13 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) { int seed = time(0); srand(seed); - // ICICLE_LOG_DEBUG << "seed = " << seed; - // const uint64_t N = 1 << (rand() % 15 + 3); - // const int batch_size = 1 << (rand() % 5); - // const bool columns_batch = rand() % 2; - const uint64_t N = 1 << (8); - const int batch_size = 1 << (3); - const bool columns_batch = 1; + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + // const uint64_t N = 1 << (20); + // const int batch_size = 1 << (0); + // const bool columns_batch = 0; const int total_size = N * 
batch_size; auto in_a = std::make_unique(total_size); @@ -816,7 +816,7 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) // Initialize each entire vector with 1 at a random index. The highest non-zero index is the index with 1 for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { - if (!s_is_cuda_registered) { out_ref[idx_in_batch] = static_cast(rand() % N); } // highest_non_zero_idx + out_ref[idx_in_batch] = static_cast(rand() % N); // highest_non_zero_idx for (uint32_t i = 0; i < N; i++) { if (columns_batch) { in_a[idx_in_batch + batch_size * i] = TypeParam::from(i == out_ref[idx_in_batch] ? 1 : 0); @@ -830,9 +830,9 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) // std::cout << "out_main:\t["; for (int i = 0; i < batch_size-1; i++) { std::cout << out_main[i] << ", "; } std::cout // < Date: Thu, 31 Oct 2024 13:19:31 +0200 Subject: [PATCH 22/43] slice and poly_dev apis deprecated use new ones with warning --- icicle/include/icicle/vec_ops.h | 33 ++++++++----- icicle/src/vec_ops.cpp | 85 ++++++++++++++++++++------------- 2 files changed, 73 insertions(+), 45 deletions(-) diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index bf5eab324..2868aa682 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -232,8 +232,7 @@ namespace icicle { * @note To subtract a scalar from each element of a vector - use scalar_add_vec with negative scalar. */ template - eIcicleError scalar_add_vec( - const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); + eIcicleError scalar_add_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); /** * @brief Subtracts each element of a vector from a scalar, elementwise (res[i]=scalar-vec[i]). @@ -252,8 +251,7 @@ namespace icicle { * @note To subtract a scalar from each element of a vector - use scalar_add_vec with negative scalar. 
*/ template - eIcicleError scalar_sub_vec( - const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); + eIcicleError scalar_sub_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); /** * @brief Multiplies each element of a vector by a scalar. @@ -271,8 +269,7 @@ namespace icicle { * @return eIcicleError Error code indicating success or failure. */ template - eIcicleError scalar_mul_vec( - const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); + eIcicleError scalar_mul_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); // Matrix operations @@ -344,13 +341,8 @@ namespace icicle { // Deprecated slice API template - eIcicleError slice( - const T* vec_in, - uint64_t offset, - uint64_t stride, - uint64_t size, - const VecOpsConfig& config, - T* vec_out); + eIcicleError + slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size_out, const VecOpsConfig& config, T* vec_out); /** * @brief Finds the highest non-zero index in a vector or batch of vectors. @@ -406,9 +398,11 @@ namespace icicle { * contiguously. * - If `config.columns_batch` is `true`, coefficients are interleaved. * @param numerator_deg Degree of the numerator polynomial. + * @param numerator_size size (number of T elements) in numerator vec * @param denominator Pointer to the array of coefficients of the denominator polynomial(s). * - Storage layout is similar to `numerator`. * @param denominator_deg Degree of the denominator polynomial. + * @param denominator_size size (number of T elements) in denumerator vec * @param config Configuration for the operation. * @param q_size Size of the quotient array for one polynomial. * @param r_size Size of the remainder array. 
@@ -437,4 +431,17 @@ namespace icicle { T* q_out /*OUT*/, T* r_out /*OUT*/); + // deprecated API + template + eIcicleError polynomial_division( + const T* numerator, + int64_t numerator_deg, + const T* denumerator, + int64_t denumerator_deg, + const VecOpsConfig& config, + T* q_out /*OUT*/, + uint64_t q_size, + T* r_out /*OUT*/, + uint64_t r_size); + } // namespace icicle \ No newline at end of file diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index 4606a1f8c..c8b867470 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -184,22 +184,14 @@ namespace icicle { ICICLE_DISPATCHER_INST(ScalarAddDispatcher, scalar_add_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_add_vec)( - const scalar_t* scalar_a, - const scalar_t* vec_b, - uint64_t size, - const VecOpsConfig* config, - scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { return ScalarAddDispatcher::execute(scalar_a, vec_b, size, *config, output); } template <> eIcicleError scalar_add_vec( - const scalar_t* scalar_a, - const scalar_t* vec_b, - uint64_t size, - const VecOpsConfig& config, - scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { return CONCAT_EXPAND(FIELD, scalar_add_vec)(scalar_a, vec_b, size, &config, output); } @@ -208,22 +200,14 @@ namespace icicle { ICICLE_DISPATCHER_INST(ScalarSubDispatcher, scalar_sub_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_sub_vec)( - const scalar_t* scalar_a, - const scalar_t* vec_b, - uint64_t size, - const VecOpsConfig* config, - scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { return ScalarSubDispatcher::execute(scalar_a, vec_b, size, *config, output); } template <> eIcicleError scalar_sub_vec( - const scalar_t* scalar_a, - const scalar_t* 
vec_b, - uint64_t size, - const VecOpsConfig& config, - scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { return CONCAT_EXPAND(FIELD, scalar_sub_vec)(scalar_a, vec_b, size, &config, output); } @@ -231,22 +215,14 @@ namespace icicle { ICICLE_DISPATCHER_INST(ScalarMulDispatcher, scalar_mul_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_mul_vec)( - const scalar_t* scalar_a, - const scalar_t* vec_b, - uint64_t size, - const VecOpsConfig* config, - scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { return ScalarMulDispatcher::execute(scalar_a, vec_b, size, *config, output); } template <> eIcicleError scalar_mul_vec( - const scalar_t* scalar_a, - const scalar_t* vec_b, - uint64_t size, - const VecOpsConfig& config, - scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { return CONCAT_EXPAND(FIELD, scalar_mul_vec)(scalar_a, vec_b, size, &config, output); } @@ -347,6 +323,25 @@ namespace icicle { return CONCAT_EXPAND(FIELD, slice)(input, offset, stride, size_in, size_out, &config, output); } + // Deprecated API + template <> + eIcicleError slice( + const scalar_t* input, + uint64_t offset, + uint64_t stride, + uint64_t size_out, + const VecOpsConfig& config, + scalar_t* output) + { + const auto size_in = offset + stride * (size_out - 1) + 1; // input should be at least that large + ICICLE_LOG_WARNING << "slice api is deprecated and replace with new api. 
Use new slice api instead"; + if (config.batch_size != 1) { + ICICLE_LOG_ERROR << "deprecated slice API does not support batch"; + return eIcicleError::INVALID_ARGUMENT; + } + return slice(input, offset, stride, size_in, size_out, config, output); + } + #ifdef EXT_FIELD ICICLE_DISPATCHER_INST(ExtFieldSliceDispatcher, extension_slice, extFieldSliceOpImpl) @@ -436,7 +431,8 @@ namespace icicle { scalar_t* r_out /*OUT*/) { return ScalarPolyDivDispatcher::execute( - numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, config, q_out, r_out); + numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, config, + q_out, r_out); } template <> @@ -454,7 +450,32 @@ namespace icicle { scalar_t* r_out /*OUT*/) { return CONCAT_EXPAND(FIELD, poly_division)( - numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, config, q_out, r_out); + numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, config, + q_out, r_out); + } + + // Deprecated API + template <> + eIcicleError polynomial_division( + const scalar_t* numerator, + int64_t numerator_deg, + const scalar_t* denumerator, + int64_t denumerator_deg, + const VecOpsConfig& config, + scalar_t* q_out /*OUT*/, + uint64_t q_size, + scalar_t* r_out /*OUT*/, + uint64_t r_size) + { + ICICLE_LOG_WARNING + << "polynomial_division api is deprecated and replace with new api. 
Use new polynomial_division api instead"; + if (config.batch_size != 1) { + ICICLE_LOG_ERROR << "deprecated polynomial_division API does not support batch"; + return eIcicleError::INVALID_ARGUMENT; + } + return polynomial_division( + numerator, numerator_deg, numerator_deg + 1, denumerator, denumerator_deg, denumerator_deg + 1, q_size, r_size, + config, q_out, r_out); } } // namespace icicle \ No newline at end of file From 916618ce079d6b6fb6860ee705737c04798fd0d6 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Thu, 31 Oct 2024 15:58:56 +0200 Subject: [PATCH 23/43] poly eval WIP --- icicle/tests/test_field_api.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 95d673f8a..a4d20ce54 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -841,11 +841,17 @@ TYPED_TEST(FieldApiTest, polynomialEval) { int seed = time(0); srand(seed); - // ICICLE_LOG_DEBUG << "seed = " << seed; + ICICLE_LOG_DEBUG << "seed = " << seed; + // const uint64_t coeffs_size = 1 << (rand() % 10 + 4); + // const uint64_t domain_size = 1 << (rand() % 8 + 2); + // const int batch_size = 1 << (rand() % 5); + // const bool columns_batch = rand() % 2; + const uint64_t coeffs_size = 1 << (rand() % 10 + 4); const uint64_t domain_size = 1 << (rand() % 8 + 2); - const int batch_size = 1 << (rand() % 5); - const bool columns_batch = rand() % 2; + const int batch_size = 1 << (0); + const bool columns_batch = 0; + const int total_coeffs_size = coeffs_size * batch_size; auto in_coeffs = std::make_unique(total_coeffs_size); From f033bdbda2ca1aa6e53c23b6c2e0cd615271a6e4 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Thu, 31 Oct 2024 17:47:09 +0200 Subject: [PATCH 24/43] poly eval passes --- icicle/tests/test_field_api.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 
a4d20ce54..82ab117b7 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -842,15 +842,15 @@ TYPED_TEST(FieldApiTest, polynomialEval) int seed = time(0); srand(seed); ICICLE_LOG_DEBUG << "seed = " << seed; - // const uint64_t coeffs_size = 1 << (rand() % 10 + 4); - // const uint64_t domain_size = 1 << (rand() % 8 + 2); - // const int batch_size = 1 << (rand() % 5); - // const bool columns_batch = rand() % 2; - const uint64_t coeffs_size = 1 << (rand() % 10 + 4); const uint64_t domain_size = 1 << (rand() % 8 + 2); - const int batch_size = 1 << (0); - const bool columns_batch = 0; + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + + // const uint64_t coeffs_size = 1 << (3); + // const uint64_t domain_size = 3; + // const int batch_size = 1 << (1); + // const bool columns_batch = 1; const int total_coeffs_size = coeffs_size * batch_size; From 35d2e2384faa731b9651c7beda8165b50bf24150 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Thu, 31 Oct 2024 20:57:09 +0200 Subject: [PATCH 25/43] fix types + --- icicle/tests/test_field_api.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 82ab117b7..49fe79709 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -909,7 +909,8 @@ TYPED_TEST(FieldApiTest, polynomialDivision) const int64_t denumerator_deg = 2; const uint64_t q_size = 2; const uint64_t r_size = 4; - const int batch_size = 1 << (rand() % 5); + // const int batch_size = 1 << (rand() % 5); + const int batch_size = 1; const bool columns_batch = rand() % 2; const int64_t total_numerator_size = (numerator_deg + 1) * batch_size; @@ -995,13 +996,13 @@ TYPED_TEST(FieldApiTest, polynomialDivision) if (s_is_cuda_registered) { run(s_reference_target, q_out_ref.get(), r_out_ref.get(), VERBOSE /*=measure*/, "polynomial_division", 1); } - // std::cout << "numerator:\t["; 
for (int i = 0; i < total_numerator_size-1; i++) { std::cout << numerator[i] << ", "; - // } std::cout < Date: Thu, 31 Oct 2024 21:17:53 +0200 Subject: [PATCH 26/43] tidy up --- icicle/tests/test_field_api.cpp | 410 +++++++++++++------------------- 1 file changed, 168 insertions(+), 242 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 49fe79709..e43c60fff 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -17,6 +17,8 @@ using namespace field_config; using namespace icicle; +//TODO - add tests that test different configurations of data on device or on host. + using FpMicroseconds = std::chrono::duration; #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); #define END_TIMER(timer, msg, enable) \ @@ -28,7 +30,6 @@ static bool VERBOSE = true; static int ITERS = 1; static inline std::string s_main_target; static inline std::string s_reference_target; -// static const bool s_is_cuda_registered = is_device_registered("CUDA"); bool s_is_cuda_registered; template @@ -95,18 +96,18 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) srand(seed); ICICLE_LOG_DEBUG << "seed = " << seed; const uint64_t N = 1 << (rand() % 15 + 3); - // const uint64_t N = 1 << (3); const int batch_size = 1 << (rand() % 5); - // const int batch_size = 2; const bool columns_batch = rand() % 2; + + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + const int total_size = N * batch_size; auto in_a = std::make_unique(total_size); auto in_b = std::make_unique(total_size); auto out_main = std::make_unique(total_size); auto out_ref = std::make_unique(total_size); - ICICLE_LOG_DEBUG << "N = " << N; - ICICLE_LOG_DEBUG << "batch_size = " << batch_size; - ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; auto vector_accumulate_wrapper = [](TypeParam* a, const TypeParam* b, uint64_t size, const 
VecOpsConfig& config, TypeParam* /*out*/) { @@ -131,19 +132,7 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) END_TIMER(VECADD_sync, oss.str().c_str(), measure); }; - // warmup - // run(s_reference_target, out_ref.get(), false /*=measure*/, 16 /*=iters*/); - // run(s_main_target, out_main.get(), false /*=measure*/, 1 /*=iters*/); - - // warmup - // run(s_reference_target, out_ref.get(), false /*=measure*/, 16 /*=iters*/); - // run(s_main_target, out_main.get(), false /*=measure*/, 1 /*=iters*/); - - // Element-wise vector operations - // If config.batch_size>1, (columns_batch=true or false) the operation is done element-wise anyway, so it doesn't - // affect the test - - // // add + // add FieldApiTest::random_samples(in_a.get(), total_size); FieldApiTest::random_samples(in_b.get(), total_size); if (!s_is_cuda_registered) { @@ -156,25 +145,17 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); - // // accumulate + // accumulate FieldApiTest::random_samples(in_a.get(), total_size); FieldApiTest::random_samples(in_b.get(), total_size); - // if (!s_is_cuda_registered) { - for (int i = 0; i < total_size; i++) { - out_ref[i] = in_a[i] + in_b[i]; - } - // } else { - // run(s_reference_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); - // } + for (int i = 0; i < total_size; i++) { //TODO - compare gpu against cpu with inplace operations? 
+ out_ref[i] = in_a[i] + in_b[i]; + } run(s_main_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); - // for (int i = 0; i < total_size; i++) { - // ICICLE_LOG_DEBUG << i << ", " << in_a[i] << ", " << in_b[i] << ", " << out_ref[i]; - // } - ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), total_size * sizeof(TypeParam))); - // // sub + // sub FieldApiTest::random_samples(in_a.get(), total_size); FieldApiTest::random_samples(in_b.get(), total_size); if (!s_is_cuda_registered) { @@ -187,7 +168,7 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sub, "vector sub", ITERS); ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); - // // mul + // mul FieldApiTest::random_samples(in_a.get(), total_size); FieldApiTest::random_samples(in_b.get(), total_size); if (!s_is_cuda_registered) { @@ -219,11 +200,15 @@ TYPED_TEST(FieldApiTest, montgomeryConversion) { int seed = time(0); srand(seed); - // ICICLE_LOG_DEBUG << "seed = " << seed; + ICICLE_LOG_DEBUG << "seed = " << seed; const uint64_t N = 1 << (rand() % 15 + 3); const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; const bool is_to_montgomery = rand() % 2; + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + ICICLE_LOG_DEBUG << "is_to_montgomery = " << is_to_montgomery; const int total_size = N * batch_size; auto in_a = std::make_unique(total_size); auto out_main = std::make_unique(total_size); @@ -246,10 +231,6 @@ TYPED_TEST(FieldApiTest, montgomeryConversion) END_TIMER(MONTGOMERY, oss.str().c_str(), measure); }; - // Element-wise operation - // If config.batch_size>1, (columns_batch=true or false) the addition is done element-wise anyway, so it doesn't - // affect the test - // convert_montgomery FieldApiTest::random_samples(in_a.get(), total_size); // reference @@ 
-280,10 +261,9 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) const bool columns_batch = rand() % 2; const int total_size = N * batch_size; - // const uint64_t N = 1 << (20); - // const int batch_size = 1 << 4; - // const bool columns_batch = 1; - // const int total_size = N * batch_size; + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; auto in_a = std::make_unique(total_size); auto out_main = std::make_unique(batch_size); @@ -312,7 +292,7 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) END_TIMER(VECADD_sync, oss.str().c_str(), measure); }; - // // sum + // sum FieldApiTest::random_samples(in_a.get(), total_size); // reference for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { @@ -331,7 +311,7 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(TypeParam))); - // // product + // product FieldApiTest::random_samples(in_a.get(), total_size); if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { @@ -359,9 +339,9 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; - // const uint64_t N = 1 << (4); - // const int batch_size = 7; - // const bool columns_batch = 1; + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; const int total_size = N * batch_size; auto scalar_a = std::make_unique(batch_size); @@ -395,7 +375,7 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) END_TIMER(VECADD_sync, oss.str().c_str(), measure); }; - // // scalar add vec + // scalar add vec FieldApiTest::random_samples(scalar_a.get(), batch_size); FieldApiTest::random_samples(in_b.get(), total_size); @@ -411,13 +391,6 @@ 
TYPED_TEST(FieldApiTest, scalarVectorOps) run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); - - - // ICICLE_LOG_DEBUG << scalar_a[0] << ", "; - // ICICLE_LOG_DEBUG << scalar_a[1] << ", "; - // for (int i = 0; i < total_size; i++) { - // ICICLE_LOG_DEBUG << i << ", " << in_b[i] << ", " << out_main[i] << ", " << out_ref[i]; - // } ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); @@ -439,7 +412,7 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); - // // scalar mul vec + // scalar mul vec FieldApiTest::random_samples(scalar_a.get(), batch_size); FieldApiTest::random_samples(in_b.get(), total_size); @@ -461,23 +434,20 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) { int seed = time(0); srand(seed); - // ICICLE_LOG_DEBUG << "seed = " << seed; + ICICLE_LOG_DEBUG << "seed = " << seed; const int R = - 1 << (rand() % 8 + 2); // cpu implementation for out of place trancpose also supports sizes wich are not powers of 2 + 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes wich are not powers of 2 const int C = - 1 << (rand() % 8 + 2); // cpu implementation for out of place trancpose also supports sizes wich are not powers of 2 + 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes wich are not powers of 2 const int batch_size = 1 << (rand() % 4); const bool columns_batch = rand() % 2; const bool is_in_place = s_is_cuda_registered? 
0 : rand() % 2; //TODO - fix inplace (Hadar: I'm not sure we should support it) - // const int R = 4; // cpu implementation for out of place trancpose also supports sizes wich are not powers of 2 - // const int C = 3; - // const int batch_size = 1 << (1); - // const bool columns_batch = 1; - // const bool is_in_place = 1; + ICICLE_LOG_DEBUG << "rows = " << R; + ICICLE_LOG_DEBUG << "cols = " << C; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; - // ICICLE_LOG_DEBUG << "R = " << R << ", C = " << C << ", batch_size = " << batch_size << ", columns_batch = " << - // columns_batch << ", is_in_place = " << is_in_place; //TODO SHANIE - remove this const int total_size = R * C * batch_size; auto h_inout = std::make_unique(total_size); auto h_out_main = std::make_unique(total_size); @@ -527,7 +497,7 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) } }; - // // Option 1: Initialize each input matrix in the batch with the same ascending values + // Option 1: Initialize each input matrix in the batch with the same ascending values // for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { // for (uint32_t i = 0; i < R * C; i++) { // if(columns_batch){ @@ -538,7 +508,7 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) // } // } - // // Option 2: Initialize the entire input array with ascending values + // Option 2: Initialize the entire input array with ascending values // for (int i = 0; i < total_size; i++) { // h_inout[i] = TypeParam::from(i); // } @@ -568,19 +538,10 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) run(s_main_target, (is_in_place ? 
h_inout.get() : h_out_main.get()), VERBOSE /*=measure*/, "transpose", ITERS); - // ICICLE_LOG_DEBUG << scalar_a[0] << ", "; - // for (int i = 0; i < total_size; i++) { - // ICICLE_LOG_DEBUG << i << ", " << h_inout[i] << ", " << h_out_main[i] << ", " << h_out_ref[i]; - // } - if (is_in_place) { ASSERT_EQ(0, memcmp(h_inout.get(), h_out_ref.get(), total_size * sizeof(TypeParam))); } else { - // std::cout << "h_out_main:\t["; for (int i = 0; i < total_size-1; i++) { std::cout << h_out_main[i] << ", "; } - // std::cout <::random_samples(numerator.get(), total_numerator_size); - // FieldApiTest::random_samples(denumerator.get(), total_denumerator_size); - // // Reference implementation - // TODO - Check in comperison with GPU implementation or implement a general reference implementation - - // Option 2: Initialize the numerator and denumerator with chosen example - // And the reference implementation for the example - - for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { - if (columns_batch) { - // numerator = 3x^3+4x^2+5 - numerator[idx_in_batch + 0 * batch_size] = TypeParam::from(5); - numerator[idx_in_batch + 1 * batch_size] = TypeParam::from(0); - numerator[idx_in_batch + 2 * batch_size] = TypeParam::from(4); - numerator[idx_in_batch + 3 * batch_size] = TypeParam::from(3); - // denumerator = x^2-1 - denumerator[idx_in_batch + 0 * batch_size] = TypeParam::from(0) - TypeParam::from(1); - denumerator[idx_in_batch + 1 * batch_size] = TypeParam::from(0); - denumerator[idx_in_batch + 2 * batch_size] = TypeParam::from(1); - if (!s_is_cuda_registered) { - // q_out_ref = 3x+4 - q_out_ref[idx_in_batch + 0 * batch_size] = TypeParam::from(4); - q_out_ref[idx_in_batch + 1 * batch_size] = TypeParam::from(3); - // r_out_ref = 3x+9 - r_out_ref[idx_in_batch + 0 * batch_size] = TypeParam::from(9); - r_out_ref[idx_in_batch + 1 * batch_size] = TypeParam::from(3); - } - } else { - // numerator = 3x^3+4x^2+5 - numerator[idx_in_batch * (numerator_deg + 1) + 0] = 
TypeParam::from(5); - numerator[idx_in_batch * (numerator_deg + 1) + 1] = TypeParam::from(0); - numerator[idx_in_batch * (numerator_deg + 1) + 2] = TypeParam::from(4); - numerator[idx_in_batch * (numerator_deg + 1) + 3] = TypeParam::from(3); - // denumerator = x^2-1 - denumerator[idx_in_batch * (denumerator_deg + 1) + 0] = TypeParam::from(0) - TypeParam::from(1); - denumerator[idx_in_batch * (denumerator_deg + 1) + 1] = TypeParam::from(0); - denumerator[idx_in_batch * (denumerator_deg + 1) + 2] = TypeParam::from(1); - if (!s_is_cuda_registered) { - // q_out_ref = 3x+4 - q_out_ref[idx_in_batch * q_size + 0] = TypeParam::from(4); - q_out_ref[idx_in_batch * q_size + 1] = TypeParam::from(3); - // r_out_ref = 3x+9 - r_out_ref[idx_in_batch * r_size + 0] = TypeParam::from(9); - r_out_ref[idx_in_batch * r_size + 1] = TypeParam::from(3); - } - } - } +// TYPED_TEST(FieldApiTest, polynomialDivision) +// { +// int seed = time(0); +// srand(seed); +// ICICLE_LOG_DEBUG << "seed = " << seed; +// // const int64_t numerator_deg = 1 << 4; +// // const int64_t denumerator_deg = 1 << 2; +// // const uint64_t q_size = numerator_deg - denumerator_deg + 1; +// // const uint64_t r_size = numerator_deg + 1; +// const int64_t numerator_deg = 3; +// const int64_t denumerator_deg = 2; +// const uint64_t q_size = 2; +// const uint64_t r_size = 4; +// // const int batch_size = 1 << (rand() % 5); +// const int batch_size = 1; +// const bool columns_batch = rand() % 2; + +// const int64_t total_numerator_size = (numerator_deg + 1) * batch_size; +// const int64_t total_denumerator_size = (denumerator_deg + 1) * batch_size; +// const uint64_t total_q_size = q_size * batch_size; +// const uint64_t total_r_size = r_size * batch_size; + +// auto numerator = std::make_unique(total_numerator_size); +// auto denumerator = std::make_unique(total_denumerator_size); +// auto q_out_main = std::make_unique(total_q_size); +// auto r_out_main = std::make_unique(total_r_size); +// auto q_out_ref = 
std::make_unique(total_q_size); +// auto r_out_ref = std::make_unique(total_r_size); + +// auto run = +// [&](const std::string& dev_type, TypeParam* q_out, TypeParam* r_out, bool measure, const char* msg, int iters) { +// Device dev = {dev_type, 0}; +// icicle_set_device(dev); +// auto config = default_vec_ops_config(); +// config.batch_size = batch_size; +// config.columns_batch = columns_batch; + +// std::ostringstream oss; +// oss << dev_type << " " << msg; + +// START_TIMER(polynomialDivision) +// for (int i = 0; i < iters; ++i) { +// ICICLE_CHECK(polynomial_division( +// numerator.get(), numerator_deg, total_numerator_size, denumerator.get(), denumerator_deg, total_denumerator_size, q_size, r_size, config, q_out, r_out)); +// } +// END_TIMER(polynomialDivision, oss.str().c_str(), measure); +// }; + +// // // Option 1: Initialize input vectors with random values +// // FieldApiTest::random_samples(numerator.get(), total_numerator_size); +// // FieldApiTest::random_samples(denumerator.get(), total_denumerator_size); +// // // Reference implementation +// // TODO - Check in comperison with GPU implementation or implement a general reference implementation + +// // Option 2: Initialize the numerator and denumerator with chosen example +// // And the reference implementation for the example + +// for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { +// if (columns_batch) { +// // numerator = 3x^3+4x^2+5 +// numerator[idx_in_batch + 0 * batch_size] = TypeParam::from(5); +// numerator[idx_in_batch + 1 * batch_size] = TypeParam::from(0); +// numerator[idx_in_batch + 2 * batch_size] = TypeParam::from(4); +// numerator[idx_in_batch + 3 * batch_size] = TypeParam::from(3); +// // denumerator = x^2-1 +// denumerator[idx_in_batch + 0 * batch_size] = TypeParam::from(0) - TypeParam::from(1); +// denumerator[idx_in_batch + 1 * batch_size] = TypeParam::from(0); +// denumerator[idx_in_batch + 2 * batch_size] = TypeParam::from(1); +// if 
(!s_is_cuda_registered) { +// // q_out_ref = 3x+4 +// q_out_ref[idx_in_batch + 0 * batch_size] = TypeParam::from(4); +// q_out_ref[idx_in_batch + 1 * batch_size] = TypeParam::from(3); +// // r_out_ref = 3x+9 +// r_out_ref[idx_in_batch + 0 * batch_size] = TypeParam::from(9); +// r_out_ref[idx_in_batch + 1 * batch_size] = TypeParam::from(3); +// } +// } else { +// // numerator = 3x^3+4x^2+5 +// numerator[idx_in_batch * (numerator_deg + 1) + 0] = TypeParam::from(5); +// numerator[idx_in_batch * (numerator_deg + 1) + 1] = TypeParam::from(0); +// numerator[idx_in_batch * (numerator_deg + 1) + 2] = TypeParam::from(4); +// numerator[idx_in_batch * (numerator_deg + 1) + 3] = TypeParam::from(3); +// // denumerator = x^2-1 +// denumerator[idx_in_batch * (denumerator_deg + 1) + 0] = TypeParam::from(0) - TypeParam::from(1); +// denumerator[idx_in_batch * (denumerator_deg + 1) + 1] = TypeParam::from(0); +// denumerator[idx_in_batch * (denumerator_deg + 1) + 2] = TypeParam::from(1); +// if (!s_is_cuda_registered) { +// // q_out_ref = 3x+4 +// q_out_ref[idx_in_batch * q_size + 0] = TypeParam::from(4); +// q_out_ref[idx_in_batch * q_size + 1] = TypeParam::from(3); +// // r_out_ref = 3x+9 +// r_out_ref[idx_in_batch * r_size + 0] = TypeParam::from(9); +// r_out_ref[idx_in_batch * r_size + 1] = TypeParam::from(3); +// } +// } +// } - if (s_is_cuda_registered) { - run(s_reference_target, q_out_ref.get(), r_out_ref.get(), VERBOSE /*=measure*/, "polynomial_division", 1); - } - std::cout << "numerator:\t["; for (int i = 0; i < total_numerator_size-1; i++) { std::cout << numerator[i] << ", "; - } std::cout < Date: Thu, 31 Oct 2024 21:30:40 +0200 Subject: [PATCH 27/43] formatting and spelling --- examples/c++/vector-api/example.cpp | 2 +- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 45 +++++--------------- icicle/tests/test_field_api.cpp | 29 ++++++------- icicle_v3/include/icicle/mmcs.h | 30 +++++++++++++ 4 files changed, 56 insertions(+), 50 deletions(-) create mode 100644 
icicle_v3/include/icicle/mmcs.h diff --git a/examples/c++/vector-api/example.cpp b/examples/c++/vector-api/example.cpp index 2a998c5c7..5c4497a64 100644 --- a/examples/c++/vector-api/example.cpp +++ b/examples/c++/vector-api/example.cpp @@ -7,7 +7,7 @@ #include "icicle/utils/log.h" -// SP: I undertstand this code is auto-generated, but I can't get scrip/gen to work. +// SP: I understand this code is auto-generated, but I can't get script/gen to work. extern "C" eIcicleError bn254_vector_product( const bn254::scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result, uint64_t offset, uint64_t stride); diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index f27ab5600..826fb3bd2 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -317,7 +317,7 @@ class VectorOpTask : public TaskBase } } - // Single worker functionality for out of palce matrix transpose + // Single worker functionality for out of place matrix transpose void out_of_place_transpose() { for (uint32_t k = 0; k < m_nof_operations; ++k) { @@ -367,8 +367,8 @@ class VectorOpTask : public TaskBase public: T m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer - uint64_t m_idx_in_batch; // index in the batch. Used in intermidiate res tasks -}; // class VectorOpTask + uint64_t m_idx_in_batch; // index in the batch. 
Used in intermediate res tasks +}; // class VectorOpTask #define NOF_OPERATIONS_PER_TASK 512 #define CONFIG_NOF_THREADS_KEY "n_threads" @@ -401,12 +401,7 @@ cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t size, // Execute a full task from the type vector = scalar (op) vector template eIcicleError cpu_scalar_vector_op( - VecOperation op, - const T* scalar_a, - const T* vec_b, - uint64_t size, - const VecOpsConfig& config, - T* output) + VecOperation op, const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { TasksManager> task_manager(get_nof_workers(config) - 1); const uint64_t total_nof_operations = size; @@ -416,11 +411,8 @@ eIcicleError cpu_scalar_vector_op( VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_2ops_task( op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), scalar_a + idx_in_batch, - config.columns_batch ? vec_b + idx_in_batch + i * config.batch_size - : vec_b + idx_in_batch * size + i, - stride, - config.columns_batch ? output + idx_in_batch + i * config.batch_size - : output + idx_in_batch * size + i); + config.columns_batch ? vec_b + idx_in_batch + i * config.batch_size : vec_b + idx_in_batch * size + i, stride, + config.columns_batch ? 
output + idx_in_batch + i * config.batch_size : output + idx_in_batch * size + i); } } task_manager.wait_done(); @@ -590,12 +582,7 @@ REGISTER_VECTOR_PRODUCT_BACKEND("CPU", cpu_vector_product); /*********************************** Scalar + Vector***********************************/ template eIcicleError cpu_scalar_add( - const Device& device, - const T* scalar_a, - const T* vec_b, - uint64_t size, - const VecOpsConfig& config, - T* output) + const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { return cpu_scalar_vector_op(VecOperation::SCALAR_ADD_VEC, scalar_a, vec_b, size, config, output); } @@ -605,12 +592,7 @@ REGISTER_SCALAR_ADD_VEC_BACKEND("CPU", cpu_scalar_add); /*********************************** Scalar - Vector***********************************/ template eIcicleError cpu_scalar_sub( - const Device& device, - const T* scalar_a, - const T* vec_b, - uint64_t size, - const VecOpsConfig& config, - T* output) + const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { return cpu_scalar_vector_op(VecOperation::SCALAR_SUB_VEC, scalar_a, vec_b, size, config, output); } @@ -620,12 +602,7 @@ REGISTER_SCALAR_SUB_VEC_BACKEND("CPU", cpu_scalar_sub); /*********************************** MUL BY SCALAR***********************************/ template eIcicleError cpu_scalar_mul( - const Device& device, - const T* scalar_a, - const T* vec_b, - uint64_t size, - const VecOpsConfig& config, - T* output) + const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { return cpu_scalar_vector_op(VecOperation::SCALAR_MUL_VEC, scalar_a, vec_b, size, config, output); } @@ -669,7 +646,7 @@ uint32_t gcd(uint32_t a, uint32_t b) return a; } -// Recursive function to generate all k-ary necklaces and to replace the elements withing the necklaces +// Recursive function to generate all k-ary necklaces and to replace the elements 
within the necklaces template void gen_necklace( uint32_t t, @@ -714,7 +691,7 @@ eIcicleError matrix_transpose_necklaces( uint32_t gcd_value = gcd(log_nof_rows, log_nof_cols); uint32_t k = 1 << gcd_value; // Base of necklaces uint32_t length = - (log_nof_cols + log_nof_rows) / gcd_value; // length of necklaces. Since all are powers of 2, equvalent to + (log_nof_cols + log_nof_rows) / gcd_value; // length of necklaces. Since all are powers of 2, equivalent to // (log_nof_cols + log_nof_rows) / gcd_value; const uint64_t max_nof_operations = NOF_OPERATIONS_PER_TASK / length; const uint64_t total_elements_one_mat = static_cast(nof_rows) * nof_cols; diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index e43c60fff..1c44464cb 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -17,7 +17,7 @@ using namespace field_config; using namespace icicle; -//TODO - add tests that test different configurations of data on device or on host. +// TODO - add tests that test different configurations of data on device or on host. using FpMicroseconds = std::chrono::duration; #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); @@ -98,11 +98,11 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) const uint64_t N = 1 << (rand() % 15 + 3); const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; - + ICICLE_LOG_DEBUG << "N = " << N; ICICLE_LOG_DEBUG << "batch_size = " << batch_size; ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; - + const int total_size = N * batch_size; auto in_a = std::make_unique(total_size); auto in_b = std::make_unique(total_size); @@ -148,7 +148,7 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) // accumulate FieldApiTest::random_samples(in_a.get(), total_size); FieldApiTest::random_samples(in_b.get(), total_size); - for (int i = 0; i < total_size; i++) { //TODO - compare gpu against cpu with inplace operations? 
+ for (int i = 0; i < total_size; i++) { // TODO - compare gpu against cpu with inplace operations? out_ref[i] = in_a[i] + in_b[i]; } run(s_main_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); @@ -391,7 +391,7 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); - + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); // scalar sub vec @@ -436,12 +436,13 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) srand(seed); ICICLE_LOG_DEBUG << "seed = " << seed; const int R = - 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes wich are not powers of 2 + 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 const int C = - 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes wich are not powers of 2 + 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 const int batch_size = 1 << (rand() % 4); const bool columns_batch = rand() % 2; - const bool is_in_place = s_is_cuda_registered? 0 : rand() % 2; //TODO - fix inplace (Hadar: I'm not sure we should support it) + const bool is_in_place = + s_is_cuda_registered ? 
0 : rand() % 2; // TODO - fix inplace (Hadar: I'm not sure we should support it) ICICLE_LOG_DEBUG << "rows = " << R; ICICLE_LOG_DEBUG << "cols = " << C; @@ -777,12 +778,12 @@ TYPED_TEST(FieldApiTest, polynomialEval) const uint64_t domain_size = 1 << (rand() % 8 + 2); const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; - + ICICLE_LOG_DEBUG << "coeffs_size = " << coeffs_size; ICICLE_LOG_DEBUG << "domain_size = " << domain_size; ICICLE_LOG_DEBUG << "batch_size = " << batch_size; ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; - + const int total_coeffs_size = coeffs_size * batch_size; auto in_coeffs = std::make_unique(total_coeffs_size); @@ -815,10 +816,7 @@ TYPED_TEST(FieldApiTest, polynomialEval) run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); if (s_is_cuda_registered) { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); - ASSERT_EQ( - 0, memcmp( - out_main.get(), out_ref.get(), - total_coeffs_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_coeffs_size * sizeof(TypeParam))); } } @@ -865,7 +863,8 @@ TYPED_TEST(FieldApiTest, polynomialEval) // START_TIMER(polynomialDivision) // for (int i = 0; i < iters; ++i) { // ICICLE_CHECK(polynomial_division( -// numerator.get(), numerator_deg, total_numerator_size, denumerator.get(), denumerator_deg, total_denumerator_size, q_size, r_size, config, q_out, r_out)); +// numerator.get(), numerator_deg, total_numerator_size, denumerator.get(), denumerator_deg, +// total_denumerator_size, q_size, r_size, config, q_out, r_out)); // } // END_TIMER(polynomialDivision, oss.str().c_str(), measure); // }; diff --git a/icicle_v3/include/icicle/mmcs.h b/icicle_v3/include/icicle/mmcs.h new file mode 100644 index 000000000..94394b822 --- /dev/null +++ b/icicle_v3/include/icicle/mmcs.h @@ -0,0 +1,30 @@ +#pragma once + +#include "errors.h" +#include "runtime.h" +#include "hash.h" +#include "merkle_tree.h" 
+#include "icicle/utils/utils.h" + +#include +#include + + +template + struct Matrix { + T* values; + size_t width; + size_t height; + }; + +eIcicleError build_mmcs_tree(const Matrix* inputs, + const unsigned int number_of_inputs, + limb_t** outputs, + const Hash& hash, + const Hash& compression, + const MerkleTreeConfig& config); + + //create hash <-hasher,compressor + + //sort, and call merkle tree + //how to return outputs? \ No newline at end of file From 32bd7808d18e280c2df2017dc70a87cd85f0f6f1 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Thu, 31 Oct 2024 21:51:01 +0200 Subject: [PATCH 28/43] ntt test --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 2 +- icicle/tests/test_field_api.cpp | 226 ++++++++----------- 2 files changed, 97 insertions(+), 131 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 826fb3bd2..24e53fa59 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -350,7 +350,7 @@ class VectorOpTask : public TaskBase VecOperation m_operation; // the operation to execute uint32_t m_nof_operations; // number of operations to execute for this task - const T* m_op_a; // pointer to operand A. Operand A is a vector, or metrix in case of replace_elements + const T* m_op_a; // pointer to operand A. Operand A is a vector, or matrix in case of replace_elements const T* m_op_b; // pointer to operand B. 
Operand B is a vector or scalar uint64_t m_start_index; // index used in bitreverse operation and out of place matrix transpose uint64_t m_stop_index; // index used in reduce operations and out of place matrix transpose diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 1c44464cb..e45a3ae0b 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -933,138 +933,104 @@ TYPED_TEST(FieldApiTest, polynomialEval) // ASSERT_EQ(0, memcmp(r_out_main.get(), r_out_ref.get(), total_r_size * sizeof(TypeParam))); // } -// #ifdef NTT -// TYPED_TEST(FieldApiTest, ntt) -// { -// // ICICLE_LOG_INFO << "Current branch: " << get_current_branch(); -// ICICLE_LOG_DEBUG << "ICICLE_LOG_DEBUG"; -// // for (int i = 3; i < 23; ++i) { -// // //Randomize configuration - -// // int seed = time(0) + i; -// // // int seed = 1726493105; -// // srand(seed); -// // const bool inplace = rand() % 2; -// // const int logn = rand() % 17 + 3; -// // // const int logn = rand() % 14 + 3; -// // // const int logn = 16; -// // const uint64_t N = 1 << logn; -// // const int log_ntt_domain_size = logn + 1; -// // const int log_batch_size = rand() % 3; -// // const int batch_size = 1 << log_batch_size; -// // const Ordering ordering = static_cast(rand() % 4); -// // bool columns_batch; -// // if (logn == 7 || logn < 4) { -// // columns_batch = false; // currently not supported (icicle_v3/backend/cuda/src/ntt/ntt.cuh line 578) -// // } else { -// // // columns_batch = true; -// // columns_batch = rand() % 2; -// // } -// // // const NTTDir dir = static_cast(rand() % 2); // 0: forward, 1: inverse -// // const NTTDir dir = static_cast(0); // 0: forward, 1: inverse -// // const int log_coset_stride = rand() % 3; -// // scalar_t coset_gen; -// // if (log_coset_stride) { -// // coset_gen = scalar_t::omega(logn + log_coset_stride); -// // } else { -// // coset_gen = scalar_t::one(); -// // } - -// const bool inplace = false; -// const int logn = 15; -// 
const uint64_t N = 1 << logn; -// const int log_ntt_domain_size = logn; -// const int log_batch_size = 0; -// const int batch_size = 1 << log_batch_size; -// const Ordering ordering = static_cast(0); -// bool columns_batch = false; -// const NTTDir dir = static_cast(0); // 0: forward, 1: inverse -// const int log_coset_stride = 0; -// scalar_t coset_gen; -// if (log_coset_stride) { -// coset_gen = scalar_t::omega(logn + log_coset_stride); -// } else { -// coset_gen = scalar_t::one(); -// } +#ifdef NTT -// // TODO SHANIE : remove -// // ICICLE_LOG_INFO << "NTT test: seed=" << seed; -// // ICICLE_LOG_INFO << "NTT test: omega=" << scalar_t::omega(logn); -// // ICICLE_LOG_INFO << "NTT test:s inplace=" << inplace; -// ICICLE_LOG_INFO << "NTT test: logn=" << logn; -// // ICICLE_LOG_INFO << "NTT test: log_ntt_domain_size=" << log_ntt_domain_size; -// // ICICLE_LOG_INFO << "NTT test: log_batch_size=" << log_batch_size; -// // ICICLE_LOG_INFO << "NTT test: columns_batch=" << columns_batch; -// // ICICLE_LOG_INFO << "NTT test: ordering=" << int(ordering); -// ICICLE_LOG_INFO << "NTT test: dir=" << (dir == NTTDir::kForward ? 
"forward" : "inverse"); -// ICICLE_LOG_INFO << "NTT test: log_coset_stride=" << log_coset_stride; -// ICICLE_LOG_INFO << "NTT test: coset_gen=" << coset_gen; - -// const int total_size = N * batch_size; -// auto scalars = std::make_unique(total_size); -// FieldApiTest::random_samples(scalars.get(), total_size); -// // for (int i = 0; i < total_size; i++) { scalars[i] = scalar_t::from(i); } //FIXME SHANIE: remove -// auto out_main = std::make_unique(total_size); -// auto out_ref = std::make_unique(total_size); -// auto run = [&](const std::string& dev_type, TypeParam* out, const char* msg, bool measure, int iters) { -// Device dev = {dev_type, 0}; -// icicle_set_device(dev); -// icicleStreamHandle stream = nullptr; -// ICICLE_CHECK(icicle_create_stream(&stream)); -// auto init_domain_config = default_ntt_init_domain_config(); -// init_domain_config.stream = stream; -// init_domain_config.is_async = false; -// ConfigExtension ext; -// ext.set(CudaBackendConfig::CUDA_NTT_FAST_TWIDDLES_MODE, true); -// init_domain_config.ext = &ext; -// auto config = default_ntt_config(); -// config.stream = stream; -// config.coset_gen = coset_gen; -// config.batch_size = batch_size; // default: 1 -// config.columns_batch = columns_batch; // default: false -// config.ordering = ordering; // default: kNN -// config.are_inputs_on_device = true; -// config.are_outputs_on_device = true; -// config.is_async = false; -// ICICLE_CHECK(ntt_init_domain(scalar_t::omega(log_ntt_domain_size), init_domain_config)); -// TypeParam *d_in, *d_out; -// ICICLE_CHECK(icicle_malloc_async((void**)&d_in, total_size * sizeof(TypeParam), config.stream)); -// ICICLE_CHECK(icicle_malloc_async((void**)&d_out, total_size * sizeof(TypeParam), config.stream)); -// ICICLE_CHECK(icicle_copy_to_device_async(d_in, scalars.get(), total_size * sizeof(TypeParam), config.stream)); -// std::ostringstream oss; -// oss << dev_type << " " << msg; -// START_TIMER(NTT_sync) -// for (int i = 0; i < iters; ++i) { -// if (inplace) 
{ -// ICICLE_CHECK(ntt(d_in, N, dir, config, d_in)); -// } else { -// ICICLE_CHECK(ntt(d_in, N, dir, config, d_out)); -// } -// } -// END_TIMER(NTT_sync, oss.str().c_str(), measure); +TYPED_TEST(FieldApiTest, ntt) +{ + // Randomize configuration -// if (inplace) { -// ICICLE_CHECK(icicle_copy_to_host_async(out, d_in, total_size * sizeof(TypeParam), config.stream)); -// } else { -// ICICLE_CHECK(icicle_copy_to_host_async(out, d_out, total_size * sizeof(TypeParam), config.stream)); -// } -// ICICLE_CHECK(icicle_free_async(d_in, config.stream)); -// ICICLE_CHECK(icicle_free_async(d_out, config.stream)); -// ICICLE_CHECK(icicle_stream_synchronize(config.stream)); -// ICICLE_CHECK(icicle_destroy_stream(stream)); -// ICICLE_CHECK(ntt_release_domain()); -// }; -// // run(s_main_target, out_main.get(), "ntt", false /*=measure*/, 0 /*=iters*/); // warmup -// run(s_reference_target, out_ref.get(), "V3ntt", VERBOSE /*=measure*/, 10 /*=iters*/); -// run(s_main_target, out_main.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); -// // std::cout << "left:\t["; for (int i = 0; i < total_size-1; i++) { std::cout << out_main[i] << ", "; } std::cout -// <()); + }; + run(s_main_target, out_main.get(), "ntt", false /*=measure*/, 10 /*=iters*/); // warmup + run(s_reference_target, out_ref.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); + run(s_main_target, out_main.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); +} +#endif // NTT int main(int argc, char** argv) { From 5291608f40cecb25a97ab568386fd71a3c638d19 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Sat, 2 Nov 2024 11:13:54 +0200 Subject: [PATCH 29/43] debug eval bug --- icicle/tests/test_field_api.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index e45a3ae0b..faf1d0d8d 100644 --- a/icicle/tests/test_field_api.cpp +++ 
b/icicle/tests/test_field_api.cpp @@ -774,10 +774,15 @@ TYPED_TEST(FieldApiTest, polynomialEval) int seed = time(0); srand(seed); ICICLE_LOG_DEBUG << "seed = " << seed; - const uint64_t coeffs_size = 1 << (rand() % 10 + 4); - const uint64_t domain_size = 1 << (rand() % 8 + 2); - const int batch_size = 1 << (rand() % 5); - const bool columns_batch = rand() % 2; + // const uint64_t coeffs_size = 1 << (rand() % 10 + 4); + // const uint64_t domain_size = 1 << (rand() % 8 + 2); + // const int batch_size = 1 << (rand() % 5); + // const bool columns_batch = rand() % 2; + + const uint64_t coeffs_size = 3; + const uint64_t domain_size = 4; + const int batch_size = 1; + const bool columns_batch = 0; ICICLE_LOG_DEBUG << "coeffs_size = " << coeffs_size; ICICLE_LOG_DEBUG << "domain_size = " << domain_size; From b7b26ecf24055e85df4ae20fc0ffc6387695f38c Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Sat, 2 Nov 2024 11:37:27 +0200 Subject: [PATCH 30/43] eval bug solved --- icicle/tests/test_field_api.cpp | 42 +++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index faf1d0d8d..ce54247a3 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -774,15 +774,18 @@ TYPED_TEST(FieldApiTest, polynomialEval) int seed = time(0); srand(seed); ICICLE_LOG_DEBUG << "seed = " << seed; - // const uint64_t coeffs_size = 1 << (rand() % 10 + 4); - // const uint64_t domain_size = 1 << (rand() % 8 + 2); - // const int batch_size = 1 << (rand() % 5); - // const bool columns_batch = rand() % 2; + const uint64_t coeffs_size = 1 << (rand() % 10 + 4); + const uint64_t domain_size = 1 << (rand() % 8 + 2); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + // const bool a_on_device = rand() % 2; + // const bool b_on_device = rand() % 2; + // const bool res_on_device = rand() % 2; - const uint64_t coeffs_size = 3; - const 
uint64_t domain_size = 4; - const int batch_size = 1; - const bool columns_batch = 0; + // const uint64_t coeffs_size = 3; + // const uint64_t domain_size = 4; + // const int batch_size = 1; + // const bool columns_batch = 0; ICICLE_LOG_DEBUG << "coeffs_size = " << coeffs_size; ICICLE_LOG_DEBUG << "domain_size = " << domain_size; @@ -790,18 +793,27 @@ TYPED_TEST(FieldApiTest, polynomialEval) ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; const int total_coeffs_size = coeffs_size * batch_size; + const int total_result_size = domain_size * batch_size; auto in_coeffs = std::make_unique(total_coeffs_size); auto in_domain = std::make_unique(domain_size); - auto out_main = std::make_unique(total_coeffs_size); - auto out_ref = std::make_unique(total_coeffs_size); + auto out_main = std::make_unique(total_result_size); + auto out_ref = std::make_unique(total_result_size); auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { Device dev = {dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); config.batch_size = batch_size; - config.columns_batch = columns_batch; + // config.is_a_on_device = a_on_device; + // config.is_b_on_device = b_on_device; + // config.is_result_on_device = res_on_device; + + // if (dev_type == "CUDA") { + // in_coeffs = config.is_a_on_device ? allocate_and_copy_to_device(in_coeffs, total_coeffs_size * sizeof(E), cuda_stream) : in_coeffs; + // in_domain = config.is_b_on_device ? allocate_and_copy_to_device(in_domain, domain_size * sizeof(E), cuda_stream) : in_domain; + // out = config.is_result_on_device ? 
allocate_and_copy_to_device(out, total_result_size * sizeof(E), cuda_stream) : out; + // } std::ostringstream oss; oss << dev_type << " " << msg; @@ -811,6 +823,12 @@ TYPED_TEST(FieldApiTest, polynomialEval) ICICLE_CHECK(polynomial_eval(in_coeffs.get(), coeffs_size, in_domain.get(), domain_size, config, out)); } END_TIMER(polynomialEval, oss.str().c_str(), measure); + + // if (dev_type == "CUDA") { + // if (config.is_a_on_device) cudaFree(in_coeffs); + // if (config.is_b_on_device) cudaFree(in_domain); + // if (config.is_result_on_device) cudaFree(out); + // } }; FieldApiTest::random_samples(in_coeffs.get(), total_coeffs_size); @@ -821,7 +839,7 @@ TYPED_TEST(FieldApiTest, polynomialEval) run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); if (s_is_cuda_registered) { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_coeffs_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_result_size * sizeof(TypeParam))); } } From baf3eb2e8585ae118b027aab299b323b81f7e932 Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Sun, 3 Nov 2024 17:46:03 +0200 Subject: [PATCH 31/43] removed vec-ops example - doesn't compile and very similar to other examples --- .../c++/vector-api/.devcontainer/Dockerfile | 25 --- .../.devcontainer/devcontainer.json | 22 --- examples/c++/vector-api/CMakeLists.txt | 16 -- examples/c++/vector-api/README.md | 28 ---- examples/c++/vector-api/example.cpp | 142 ------------------ examples/c++/vector-api/run.sh | 66 -------- 6 files changed, 299 deletions(-) delete mode 100644 examples/c++/vector-api/.devcontainer/Dockerfile delete mode 100644 examples/c++/vector-api/.devcontainer/devcontainer.json delete mode 100644 examples/c++/vector-api/CMakeLists.txt delete mode 100644 examples/c++/vector-api/README.md delete mode 100644 examples/c++/vector-api/example.cpp delete mode 100755 examples/c++/vector-api/run.sh 
diff --git a/examples/c++/vector-api/.devcontainer/Dockerfile b/examples/c++/vector-api/.devcontainer/Dockerfile deleted file mode 100644 index 64188da96..000000000 --- a/examples/c++/vector-api/.devcontainer/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -# Make sure NVIDIA Container Toolkit is installed on your host - -# Use the specified base image -FROM nvidia/cuda:12.0.0-devel-ubuntu22.04 - -# Update and install dependencies -RUN apt-get update && apt-get install -y \ - cmake \ - curl \ - build-essential \ - git \ - libboost-all-dev \ - && rm -rf /var/lib/apt/lists/* - -# Clone Icicle from a GitHub repository -RUN git clone https://github.com/ingonyama-zk/icicle.git /icicle - -# Set the working directory in the container -WORKDIR /icicle-example - -# Specify the default command for the container -CMD ["/bin/bash"] - - - diff --git a/examples/c++/vector-api/.devcontainer/devcontainer.json b/examples/c++/vector-api/.devcontainer/devcontainer.json deleted file mode 100644 index 490fe90a6..000000000 --- a/examples/c++/vector-api/.devcontainer/devcontainer.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "Icicle Examples: polynomial multiplication", - "build": { - "dockerfile": "Dockerfile" - }, - "runArgs": [ - "--gpus", - "all" - ], - "postCreateCommand": [ - "nvidia-smi" - ], - "customizations": { - "vscode": { - "extensions": [ - "ms-vscode.cmake-tools", - "ms-python.python", - "ms-vscode.cpptools" - ] - } - } -} \ No newline at end of file diff --git a/examples/c++/vector-api/CMakeLists.txt b/examples/c++/vector-api/CMakeLists.txt deleted file mode 100644 index c32f17f43..000000000 --- a/examples/c++/vector-api/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -cmake_minimum_required(VERSION 3.18) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED TRUE) - -project(example) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") -add_executable(example example.cpp) -target_include_directories(example PRIVATE "../../../icicle/include" "..") 
-target_link_directories(example PRIVATE "${CMAKE_SOURCE_DIR}/build/icicle") -message("${CMAKE_BINARY_DIR}/icicle") -target_link_libraries(example PRIVATE icicle_curve_bn254 icicle_field_bn254 icicle_device) -if(BACKEND_DIR) - add_compile_definitions(BACKEND_DIR="${BACKEND_DIR}") -endif() - diff --git a/examples/c++/vector-api/README.md b/examples/c++/vector-api/README.md deleted file mode 100644 index 120156c9f..000000000 --- a/examples/c++/vector-api/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# Icicle Example: Vector Operations API - -## Key-Takeaway - -The Vector Operations API supports the following: - - - element-wise vector operations (e.g. addition, multiplication) - - vector reduction operations (e.g. sum of elements, product of elements) - - scalar-vector operations (e.g add scalar to vector) - - matrix operations (e.g. transposition) - - miscellaneous operations like bit-reversal and slicing. - - All these operations can be performed on a host or device both synchronously and asynchronously. - -## Running the example - -```sh -# for CPU -./run.sh -d CPU -# for CUDA -./run.sh -d CUDA -b /path/to/cuda/backend/install/dir -``` - -## What's in the example - -1. `example_element_wise`: examples of element-wise operations -2. `example_scalar_vector`: examples of scalar-vector operations - diff --git a/examples/c++/vector-api/example.cpp b/examples/c++/vector-api/example.cpp deleted file mode 100644 index 5c4497a64..000000000 --- a/examples/c++/vector-api/example.cpp +++ /dev/null @@ -1,142 +0,0 @@ -#include -#include -#include - -#include "icicle/runtime.h" -#include "icicle/api/bn254.h" -#include "icicle/utils/log.h" - - -// SP: I understand this code is auto-generated, but I can't get script/gen to work. 
- -extern "C" eIcicleError bn254_vector_product( - const bn254::scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result, uint64_t offset, uint64_t stride); - -extern "C" eIcicleError bn254_vector_sum( - const bn254::scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result, uint64_t offset, uint64_t stride); - -// SP: end of my changes - -using namespace bn254; - -#include "examples_utils.h" - -void random_samples(scalar_t* res, uint32_t count) -{ - for (int i = 0; i < count; i++) - res[i] = i < 1000 ? scalar_t::rand_host() : res[i - 1000]; -} - -void incremental_values(scalar_t* res, uint32_t count) -{ - for (int i = 0; i < count; i++) { - res[i] = i ? res[i - 1] + scalar_t::one() : scalar_t::one(); - } -} - - -void example_element_wise() { - return; -} - -int main(int argc, char** argv) -{ - // try_load_and_set_backend_device(argc, argv); - - int N_LOG = 20; - int N = 1 << N_LOG; - int offset = 1; - int stride = 4; - - // on-host data - auto h_a = std::make_unique(N); - auto h_b = std::make_unique(N); - auto h_out = std::make_unique(N); - - random_samples(h_a.get(), N ); - random_samples(h_b.get(), N ); - - // incremental_values(h_a.get(), N ); - // incremental_values(h_b.get(), N ); - - // on-device data - scalar_t *d_a, *d_b, *d_out; - - DeviceProperties device_props; - ICICLE_CHECK(icicle_get_device_properties(device_props)); - if (!device_props.using_host_memory) { - std::cout << "Device isn't using host memory" << std::endl; - } else { - std::cout << "Device is using host memory" << std::endl; - } - - ICICLE_CHECK(icicle_malloc((void**)&d_a, sizeof(scalar_t) * N)); - ICICLE_CHECK(icicle_malloc((void**)&d_b, sizeof(scalar_t) * N)); - ICICLE_CHECK(icicle_malloc((void**)&d_out, sizeof(scalar_t) * N)); - - ICICLE_CHECK(icicle_copy(d_a, h_a.get(), sizeof(scalar_t) * N)); - ICICLE_CHECK(icicle_copy(d_b, h_b.get(), sizeof(scalar_t) * N)); - - VecOpsConfig h_config{ - nullptr, - false, // is_a_on_device - false, // 
is_b_on_device - false, // is_result_on_device - false, // is_async - nullptr // ext - }; - - VecOpsConfig d_config{ - nullptr, - true, // is_a_on_device - true, // is_b_on_device - true, // is_result_on_device - false, // is_async - nullptr // ext - }; - - - // Reduction operations - - START_TIMER(baseline_reduce_sum); - h_out[0] = scalar_t::zero(); - for (uint64_t i = offset; i < N; i=i+stride) { - h_out[0] = h_out[0] + h_a[i]; - } - END_TIMER(baseline_reduce_sum, "baseline reduce sum took"); - - START_TIMER(reduce_sum); - ICICLE_CHECK(bn254_vector_sum(d_a, N, &h_config, d_out, offset, stride)); - END_TIMER(reduce_sum, "reduce sum took"); - - - std::cout << "h_out: " << h_out[0] << std::endl; - std::cout << "d_out: " << d_out[0] << std::endl; - - - START_TIMER(baseline_reduce_product); - h_out[0] = scalar_t::one(); - for (uint64_t i = offset; i < N; i = i + stride) { - h_out[0] = h_out[0] * h_a[i]; - } - END_TIMER(baseline_reduce_product, "baseline reduce product took"); - - - START_TIMER(reduce_product); - ICICLE_CHECK(bn254_vector_product(d_a, N, &d_config, d_out, offset, stride)); - END_TIMER(reduce_product, "reduce product took"); - - - std::cout << "h_out: " << h_out[0] << std::endl; - std::cout << "d_out: " << d_out[0] << std::endl; - - - - - - ICICLE_CHECK(icicle_free(d_a)); - ICICLE_CHECK(icicle_free(d_b)); - ICICLE_CHECK(icicle_free(d_out)); - - return 0; -} \ No newline at end of file diff --git a/examples/c++/vector-api/run.sh b/examples/c++/vector-api/run.sh deleted file mode 100755 index 879390d0a..000000000 --- a/examples/c++/vector-api/run.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -# Exit immediately if a command exits with a non-zero status -set -e - -# Function to display usage information -show_help() { - echo "Usage: $0 [-d DEVICE_TYPE] [-b ICICLE_BACKEND_INSTALL_DIR]" - echo - echo "Options:" - echo " -d DEVICE_TYPE Specify the device type (default: CPU)" - echo " -b ICICLE_BACKEND_INSTALL_DIR Specify the backend installation directory 
(default: empty)" - echo " -h Show this help message" - exit 0 -} - -# Parse command line options -while getopts ":d:b:h" opt; do - case ${opt} in - d ) - DEVICE_TYPE=$OPTARG - ;; - b ) - ICICLE_BACKEND_INSTALL_DIR="$(realpath ${OPTARG})" - ;; - h ) - show_help - ;; - \? ) - echo "Invalid option: -$OPTARG" 1>&2 - show_help - ;; - : ) - echo "Invalid option: -$OPTARG requires an argument" 1>&2 - show_help - ;; - esac -done - -# Set default values if not provided -: "${DEVICE_TYPE:=CPU}" -: "${ICICLE_BACKEND_INSTALL_DIR:=}" - -# Create necessary directories -mkdir -p build/example -mkdir -p build/icicle - -ICILE_DIR=$(realpath "../../../icicle/") -ICICLE_CUDA_SOURCE_DIR="${ICILE_DIR}/backend/cuda" - -# Build Icicle and the example app that links to it -if [ "$DEVICE_TYPE" == "CUDA" ] && [ ! -d "${ICICLE_BACKEND_INSTALL_DIR}" ] && [ -d "${ICICLE_CUDA_SOURCE_DIR}" ]; then - echo "Building icicle with CUDA backend" - cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DMSM=OFF -DG2=OFF -DECNTT=OFF -DCUDA_BACKEND=local -S "${ICILE_DIR}" -B build/icicle - export ICICLE_BACKEND_INSTALL_DIR=$(realpath "build/icicle/backend") -else - echo "Building icicle without CUDA backend, ICICLE_BACKEND_INSTALL_DIR=${ICICLE_BACKEND_INSTALL_DIR}" - export ICICLE_BACKEND_INSTALL_DIR="${ICICLE_BACKEND_INSTALL_DIR}" - cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -S "${ICILE_DIR}" -B build/icicle -fi -cmake -DCMAKE_BUILD_TYPE=Release -S . 
-B build/example - -cmake --build build/icicle -j -cmake --build build/example -j - -./build/example/example "$DEVICE_TYPE" From 2ed43696dbe970d35de9638eb264be18f32d23f8 Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Sun, 3 Nov 2024 19:55:50 +0200 Subject: [PATCH 32/43] updated poly-div test and poly-eval fix for column mode --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 6 +- icicle/include/icicle/vec_ops.h | 8 +- icicle/tests/test_field_api.cpp | 230 ++++++------------- 3 files changed, 73 insertions(+), 171 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 24e53fa59..0e3e7c2d5 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -902,11 +902,7 @@ eIcicleError cpu_poly_divide( ICICLE_ASSERT(q_size >= (numerator_deg - denumerator_deg + 1)) << "polynomial division expects q(x) size to be at least deg(numerator)-deg(denumerator)+1"; - // ICICLE_CHECK(icicle_copy_async(r_out, numerator, r_size * config.batch_size * sizeof(T), config.stream)); - // copy numerator to r_out // FIXME should it be copied using icicle_copy_async? - for (uint64_t i = 0; i < (numerator_deg + 1) * config.batch_size; ++i) { - r_out[i] = numerator[i]; - } + memcpy(r_out, numerator, sizeof(T) * numerator_size * config.batch_size); uint32_t stride = config.columns_batch ? config.batch_size : 1; auto deg_r = std::make_unique(config.batch_size); diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index 2868aa682..cc317783b 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -397,11 +397,11 @@ namespace icicle { * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored * contiguously. * - If `config.columns_batch` is `true`, coefficients are interleaved. - * @param numerator_deg Degree of the numerator polynomial. 
+ * @param numerator_max_deg Maximal degree of the numerator polynomials in the batch. * @param numerator_size size (number of T elements) in numerator vec * @param denominator Pointer to the array of coefficients of the denominator polynomial(s). * - Storage layout is similar to `numerator`. - * @param denominator_deg Degree of the denominator polynomial. + * @param denumerator_max_deg Maximal degree of the denominator polynomials in the batch. * @param denominator_size size (number of T elements) in denumerator vec * @param config Configuration for the operation. * @param q_size Size of the quotient array for one polynomial. @@ -420,10 +420,10 @@ namespace icicle { template eIcicleError polynomial_division( const T* numerator, - int64_t numerator_deg, + int64_t numerator_max_deg, uint64_t numerator_size, const T* denumerator, - int64_t denumerator_deg, + int64_t denumerator_max_deg, uint64_t denumerator_size, uint64_t q_size, uint64_t r_size, diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index ce54247a3..62c3a75bf 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -17,7 +17,7 @@ using namespace field_config; using namespace icicle; -// TODO - add tests that test different configurations of data on device or on host. +// TODO Hadar - add tests that test different configurations of data on device or on host. 
using FpMicroseconds = std::chrono::duration; #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); @@ -30,7 +30,8 @@ static bool VERBOSE = true; static int ITERS = 1; static inline std::string s_main_target; static inline std::string s_reference_target; -bool s_is_cuda_registered; +static inline std::vector s_registered_devices; +bool s_is_cuda_registered; // TODO Yuval remove this template class FieldApiTest : public ::testing::Test @@ -48,6 +49,7 @@ class FieldApiTest : public ::testing::Test if (!s_is_cuda_registered) { ICICLE_LOG_ERROR << "CUDA device not found. Testing CPU vs reference (on cpu)"; } s_main_target = s_is_cuda_registered ? "CUDA" : "CPU"; s_reference_target = "CPU"; + s_registered_devices = get_registered_devices_list(); } static void TearDownTestSuite() { @@ -436,9 +438,11 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) srand(seed); ICICLE_LOG_DEBUG << "seed = " << seed; const int R = - 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 + 1 + << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 const int C = - 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 + 1 + << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 const int batch_size = 1 << (rand() % 4); const bool columns_batch = rand() % 2; const bool is_in_place = @@ -727,11 +731,12 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) const bool columns_batch = rand() % 2; const int total_size = N * batch_size; - ICICLE_LOG_DEBUG << "N = " << N; - ICICLE_LOG_DEBUG << "batch_size = " << batch_size; - ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; - auto in_a = std::make_unique(total_size); + for (int i = 0; i < batch_size; ++i) { + // randomize different rows with zeros in the end + auto size = 
std::max(int64_t(N) / 4 - i, int64_t(1)); + scalar_t::rand_host_many(in_a.get() + i * N, size); + } auto out_main = std::make_unique(batch_size); auto out_ref = std::make_unique(batch_size); @@ -752,20 +757,8 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) END_TIMER(highestNonZeroIdx, oss.str().c_str(), measure); }; - // Initialize each entire vector with 1 at a random index. The highest non-zero index is the index with 1 - for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { - out_ref[idx_in_batch] = static_cast(rand() % N); // highest_non_zero_idx - for (uint32_t i = 0; i < N; i++) { - if (columns_batch) { - in_a[idx_in_batch + batch_size * i] = TypeParam::from(i == out_ref[idx_in_batch] ? 1 : 0); - } else { - in_a[idx_in_batch * N + i] = TypeParam::from(i == out_ref[idx_in_batch] ? 1 : 0); - } - } - } - if (s_is_cuda_registered) { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "highest_non_zero_idx", 1); } + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "highest_non_zero_idx", 1); run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "highest_non_zero_idx", 1); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(int64_t))); } @@ -778,14 +771,6 @@ TYPED_TEST(FieldApiTest, polynomialEval) const uint64_t domain_size = 1 << (rand() % 8 + 2); const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; - // const bool a_on_device = rand() % 2; - // const bool b_on_device = rand() % 2; - // const bool res_on_device = rand() % 2; - - // const uint64_t coeffs_size = 3; - // const uint64_t domain_size = 4; - // const int batch_size = 1; - // const bool columns_batch = 0; ICICLE_LOG_DEBUG << "coeffs_size = " << coeffs_size; ICICLE_LOG_DEBUG << "domain_size = " << domain_size; @@ -805,15 +790,7 @@ TYPED_TEST(FieldApiTest, polynomialEval) icicle_set_device(dev); auto config = default_vec_ops_config(); config.batch_size = batch_size; - // config.is_a_on_device = a_on_device; - // 
config.is_b_on_device = b_on_device; - // config.is_result_on_device = res_on_device; - - // if (dev_type == "CUDA") { - // in_coeffs = config.is_a_on_device ? allocate_and_copy_to_device(in_coeffs, total_coeffs_size * sizeof(E), cuda_stream) : in_coeffs; - // in_domain = config.is_b_on_device ? allocate_and_copy_to_device(in_domain, domain_size * sizeof(E), cuda_stream) : in_domain; - // out = config.is_result_on_device ? allocate_and_copy_to_device(out, total_result_size * sizeof(E), cuda_stream) : out; - // } + config.columns_batch = columns_batch; std::ostringstream oss; oss << dev_type << " " << msg; @@ -823,138 +800,67 @@ TYPED_TEST(FieldApiTest, polynomialEval) ICICLE_CHECK(polynomial_eval(in_coeffs.get(), coeffs_size, in_domain.get(), domain_size, config, out)); } END_TIMER(polynomialEval, oss.str().c_str(), measure); - - // if (dev_type == "CUDA") { - // if (config.is_a_on_device) cudaFree(in_coeffs); - // if (config.is_b_on_device) cudaFree(in_domain); - // if (config.is_result_on_device) cudaFree(out); - // } }; FieldApiTest::random_samples(in_coeffs.get(), total_coeffs_size); FieldApiTest::random_samples(in_domain.get(), domain_size); - // Reference implementation - TODO - run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); - if (s_is_cuda_registered) { - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_result_size * sizeof(TypeParam))); - } + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_result_size * sizeof(TypeParam))); } -// TYPED_TEST(FieldApiTest, polynomialDivision) -// { -// int seed = time(0); -// srand(seed); -// ICICLE_LOG_DEBUG << "seed = " << seed; -// // const int64_t numerator_deg = 1 << 4; -// // const int64_t denumerator_deg = 1 << 2; -// // const uint64_t q_size = numerator_deg - denumerator_deg + 1; -// // const 
uint64_t r_size = numerator_deg + 1; -// const int64_t numerator_deg = 3; -// const int64_t denumerator_deg = 2; -// const uint64_t q_size = 2; -// const uint64_t r_size = 4; -// // const int batch_size = 1 << (rand() % 5); -// const int batch_size = 1; -// const bool columns_batch = rand() % 2; - -// const int64_t total_numerator_size = (numerator_deg + 1) * batch_size; -// const int64_t total_denumerator_size = (denumerator_deg + 1) * batch_size; -// const uint64_t total_q_size = q_size * batch_size; -// const uint64_t total_r_size = r_size * batch_size; - -// auto numerator = std::make_unique(total_numerator_size); -// auto denumerator = std::make_unique(total_denumerator_size); -// auto q_out_main = std::make_unique(total_q_size); -// auto r_out_main = std::make_unique(total_r_size); -// auto q_out_ref = std::make_unique(total_q_size); -// auto r_out_ref = std::make_unique(total_r_size); - -// auto run = -// [&](const std::string& dev_type, TypeParam* q_out, TypeParam* r_out, bool measure, const char* msg, int iters) { -// Device dev = {dev_type, 0}; -// icicle_set_device(dev); -// auto config = default_vec_ops_config(); -// config.batch_size = batch_size; -// config.columns_batch = columns_batch; - -// std::ostringstream oss; -// oss << dev_type << " " << msg; - -// START_TIMER(polynomialDivision) -// for (int i = 0; i < iters; ++i) { -// ICICLE_CHECK(polynomial_division( -// numerator.get(), numerator_deg, total_numerator_size, denumerator.get(), denumerator_deg, -// total_denumerator_size, q_size, r_size, config, q_out, r_out)); -// } -// END_TIMER(polynomialDivision, oss.str().c_str(), measure); -// }; - -// // // Option 1: Initialize input vectors with random values -// // FieldApiTest::random_samples(numerator.get(), total_numerator_size); -// // FieldApiTest::random_samples(denumerator.get(), total_denumerator_size); -// // // Reference implementation -// // TODO - Check in comperison with GPU implementation or implement a general reference 
implementation - -// // Option 2: Initialize the numerator and denumerator with chosen example -// // And the reference implementation for the example - -// for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { -// if (columns_batch) { -// // numerator = 3x^3+4x^2+5 -// numerator[idx_in_batch + 0 * batch_size] = TypeParam::from(5); -// numerator[idx_in_batch + 1 * batch_size] = TypeParam::from(0); -// numerator[idx_in_batch + 2 * batch_size] = TypeParam::from(4); -// numerator[idx_in_batch + 3 * batch_size] = TypeParam::from(3); -// // denumerator = x^2-1 -// denumerator[idx_in_batch + 0 * batch_size] = TypeParam::from(0) - TypeParam::from(1); -// denumerator[idx_in_batch + 1 * batch_size] = TypeParam::from(0); -// denumerator[idx_in_batch + 2 * batch_size] = TypeParam::from(1); -// if (!s_is_cuda_registered) { -// // q_out_ref = 3x+4 -// q_out_ref[idx_in_batch + 0 * batch_size] = TypeParam::from(4); -// q_out_ref[idx_in_batch + 1 * batch_size] = TypeParam::from(3); -// // r_out_ref = 3x+9 -// r_out_ref[idx_in_batch + 0 * batch_size] = TypeParam::from(9); -// r_out_ref[idx_in_batch + 1 * batch_size] = TypeParam::from(3); -// } -// } else { -// // numerator = 3x^3+4x^2+5 -// numerator[idx_in_batch * (numerator_deg + 1) + 0] = TypeParam::from(5); -// numerator[idx_in_batch * (numerator_deg + 1) + 1] = TypeParam::from(0); -// numerator[idx_in_batch * (numerator_deg + 1) + 2] = TypeParam::from(4); -// numerator[idx_in_batch * (numerator_deg + 1) + 3] = TypeParam::from(3); -// // denumerator = x^2-1 -// denumerator[idx_in_batch * (denumerator_deg + 1) + 0] = TypeParam::from(0) - TypeParam::from(1); -// denumerator[idx_in_batch * (denumerator_deg + 1) + 1] = TypeParam::from(0); -// denumerator[idx_in_batch * (denumerator_deg + 1) + 2] = TypeParam::from(1); -// if (!s_is_cuda_registered) { -// // q_out_ref = 3x+4 -// q_out_ref[idx_in_batch * q_size + 0] = TypeParam::from(4); -// q_out_ref[idx_in_batch * q_size + 1] = TypeParam::from(3); -// // 
r_out_ref = 3x+9 -// r_out_ref[idx_in_batch * r_size + 0] = TypeParam::from(9); -// r_out_ref[idx_in_batch * r_size + 1] = TypeParam::from(3); -// } -// } -// } - -// if (s_is_cuda_registered) { -// run(s_reference_target, q_out_ref.get(), r_out_ref.get(), VERBOSE /*=measure*/, "polynomial_division", 1); -// } -// std::cout << "numerator:\t["; for (int i = 0; i < total_numerator_size-1; i++) { std::cout << numerator[i] << ", "; -// } std::cout <(numerator_size * batch_size); + auto denumerator = std::make_unique(denumerator_size * batch_size); + TypeParam::rand_host_many(numerator.get(), numerator_size * batch_size); + TypeParam::rand_host_many(denumerator.get(), denumerator_size * batch_size); + + for (auto device : s_registered_devices) { + ICICLE_CHECK(icicle_set_device(device)); + for (int columns_batch = 0; columns_batch <= 1; columns_batch++) { + ICICLE_LOG_DEBUG << "testing polynomial division on device " << device << " [column_batch=" << columns_batch + << "]"; + auto q = std::make_unique(q_size * batch_size); + auto r = std::make_unique(r_size * batch_size); + + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + ICICLE_CHECK(polynomial_division( + numerator.get(), max_num_deg, numerator_size, denumerator.get(), max_denum_deg, denumerator_size, q_size, + r_size, config, q.get(), r.get())); + + // test a(x)=q(x)b(x)+r(x) in random point + const auto rand_x = TypeParam::rand_host(); + auto ax = std::make_unique(batch_size); + auto bx = std::make_unique(batch_size); + auto qx = std::make_unique(batch_size); + auto rx = std::make_unique(batch_size); + polynomial_eval(numerator.get(), numerator_size, &rand_x, 1, config, ax.get()); + polynomial_eval(denumerator.get(), denumerator_size, &rand_x, 1, config, bx.get()); + polynomial_eval(q.get(), q_size, &rand_x, 1, config, qx.get()); + polynomial_eval(r.get(), r_size, &rand_x, 1, config, rx.get()); + + for (int i = 0; i < batch_size; ++i) { + // 
ICICLE_LOG_DEBUG << "ax=" << ax[i] << ", bx=" << bx[i] << ", qx=" << qx[i] << ", rx=" << rx[i]; + ASSERT_EQ(ax[i], qx[i] * bx[i] + rx[i]); + } + } + } +} #ifdef NTT From b7d62c8910c13bc9085de5514a4e2f78e8b07c8a Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 17:35:26 +0200 Subject: [PATCH 33/43] updated for poly div --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 59 ++++++++++--------- .../include/icicle/backend/vec_ops_backend.h | 12 ++-- .../default_backend/default_poly_backend.h | 2 +- icicle/include/icicle/vec_ops.h | 30 +++------- icicle/src/vec_ops.cpp | 50 ++++------------ icicle/tests/test_field_api.cpp | 45 +++++++++----- 6 files changed, 84 insertions(+), 114 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 0e3e7c2d5..b22c1ade2 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -886,43 +886,48 @@ template eIcicleError cpu_poly_divide( const Device& device, const T* numerator, - int64_t numerator_deg, uint64_t numerator_size, - const T* denumerator, - int64_t denumerator_deg, - uint64_t denumerator_size, - uint64_t q_size, - uint64_t r_size, + const T* denominator, + uint64_t denominator_size, const VecOpsConfig& config, T* q_out /*OUT*/, - T* r_out /*OUT*/) + uint64_t q_size, + T* r_out /*OUT*/, + uint64_t r_size) { - ICICLE_ASSERT(r_size >= numerator_deg) - << "polynomial division expects r(x) size to be similar to numerator size and higher than numerator degree(x)"; - ICICLE_ASSERT(q_size >= (numerator_deg - denumerator_deg + 1)) - << "polynomial division expects q(x) size to be at least deg(numerator)-deg(denumerator)+1"; - - memcpy(r_out, numerator, sizeof(T) * numerator_size * config.batch_size); + if (config.batch_size != 1 && config.columns_batch) { + ICICLE_LOG_ERROR << "polynomial division is not implemented for column batch. 
Planned for v3.2"; + return eIcicleError::API_NOT_IMPLEMENTED; + } uint32_t stride = config.columns_batch ? config.batch_size : 1; - auto deg_r = std::make_unique(config.batch_size); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) { - const T* curr_denumerator = config.columns_batch - ? denumerator + idx_in_batch - : denumerator + idx_in_batch * (denumerator_deg + 1); // Pointer to the current vector - T* curr_q_out = - config.columns_batch ? q_out + idx_in_batch : q_out + idx_in_batch * q_size; // Pointer to the current vector - T* curr_r_out = - config.columns_batch ? r_out + idx_in_batch : r_out + idx_in_batch * r_size; // Pointer to the current vector + const T* curr_numerator = + config.columns_batch ? numerator + idx_in_batch : numerator + idx_in_batch * numerator_size; + const T* curr_denominator = + config.columns_batch ? denominator + idx_in_batch : denominator + idx_in_batch * denominator_size; + T* curr_q_out = config.columns_batch ? q_out + idx_in_batch : q_out + idx_in_batch * q_size; + T* curr_r_out = config.columns_batch ? 
r_out + idx_in_batch : r_out + idx_in_batch * r_size; + + int64_t numerator_deg, denominator_deg; + cpu_highest_non_zero_idx(device, curr_numerator, numerator_size, default_vec_ops_config(), &numerator_deg); + cpu_highest_non_zero_idx(device, curr_denominator, denominator_size, default_vec_ops_config(), &denominator_deg); + ICICLE_ASSERT(r_size >= numerator_deg + 1) + << "polynomial division expects r(x) size to be similar to numerator size and higher than numerator degree(x)"; + ICICLE_ASSERT(q_size >= (numerator_deg - denominator_deg + 1)) + << "polynomial division expects q(x) size to be at least deg(numerator)-deg(denominator)+1"; + + memset(curr_r_out, 0, sizeof(T) * r_size); + memcpy(curr_r_out, curr_numerator, sizeof(T) * (numerator_deg + 1)); + // invert largest coeff of b - const T& lc_b_inv = T::inverse(curr_denumerator[denumerator_deg * stride]); - deg_r[idx_in_batch] = numerator_deg; - while (deg_r[idx_in_batch] >= denumerator_deg) { + const T& lc_b_inv = T::inverse(curr_denominator[denominator_deg * stride]); + int64_t deg_r = numerator_deg; + while (deg_r >= denominator_deg) { // each iteration is removing the largest monomial in r until deg(r); + uint64_t q_size, + scalar_t* r_out /*OUT*/, + uint64_t r_size)>; void register_vector_add(const std::string& deviceType, vectorVectorOpImpl impl); diff --git a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h index a42c87317..bfa57f9c3 100644 --- a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h +++ b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h @@ -278,7 +278,7 @@ namespace icicle { config.is_result_on_device = true; ICICLE_CHECK(icicle::polynomial_division( - a_coeffs, deg_a, a_N, b_coeffs, deg_b, b_N, deg_a - deg_b + 1, a_N, config, Q_coeffs, R_coeffs)); + a_coeffs, deg_a + 1, b_coeffs, deg_b + 1, config, Q_coeffs, deg_a - deg_b + 1, R_coeffs, a_N)); } 
void quotient(PolyContext Q, PolyContext op_a, PolyContext op_b) override diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index cc317783b..38551ab6a 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -397,47 +397,31 @@ namespace icicle { * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored * contiguously. * - If `config.columns_batch` is `true`, coefficients are interleaved. - * @param numerator_max_deg Maximal degree of the numerator polynomials in the batch. - * @param numerator_size size (number of T elements) in numerator vec + * @param numerator_size size (number of T elements) in numerator vec of a single batch element * @param denominator Pointer to the array of coefficients of the denominator polynomial(s). * - Storage layout is similar to `numerator`. - * @param denumerator_max_deg Maximal degree of the denominator polynomials in the batch. - * @param denominator_size size (number of T elements) in denumerator vec + * @param denominator_size size (number of T elements) in denominator vec of a single batch element * @param config Configuration for the operation. - * @param q_size Size of the quotient array for one polynomial. - * @param r_size Size of the remainder array. * @param q_out Pointer to the array where the quotient polynomial(s) will be stored. This is an output parameter. * - The storage layout should match that of `numerator`. + * @param q_size Size of the quotient array for one polynomial. * @param r_out Pointer to the array where the remainder polynomial(s) will be stored. This is an output parameter. * - The storage layout should match that of `numerator`. * - The size of `r_out` should be sufficient to hold the remainder coefficients for each polynomial. + * @param r_size Size of the remainder array. * @return eIcicleError Error code indicating success or failure. 
* * @note The degrees should satisfy `numerator_deg >= denominator_deg`. * The sizes `q_size` and `r_size` must be at least `numerator_deg - denominator_deg + 1` and `denominator_deg`, * respectively. The function assumes that the input and output arrays are properly allocated. */ - template - eIcicleError polynomial_division( - const T* numerator, - int64_t numerator_max_deg, - uint64_t numerator_size, - const T* denumerator, - int64_t denumerator_max_deg, - uint64_t denumerator_size, - uint64_t q_size, - uint64_t r_size, - const VecOpsConfig& config, - T* q_out /*OUT*/, - T* r_out /*OUT*/); - // deprecated API template eIcicleError polynomial_division( const T* numerator, - int64_t numerator_deg, - const T* denumerator, - int64_t denumerator_deg, + uint64_t numerator_size, + const T* denominator, + uint64_t denominator_size, const VecOpsConfig& config, T* q_out /*OUT*/, uint64_t q_size, diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index c8b867470..ebb86e0c1 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -419,63 +419,33 @@ namespace icicle { extern "C" eIcicleError CONCAT_EXPAND(FIELD, poly_division)( const scalar_t* numerator, - int64_t numerator_deg, uint64_t numerator_size, - const scalar_t* denumerator, - int64_t denumerator_deg, - uint64_t denumerator_size, - uint64_t q_size, - uint64_t r_size, + const scalar_t* denominator, + int64_t denominator_size, const VecOpsConfig& config, scalar_t* q_out /*OUT*/, - scalar_t* r_out /*OUT*/) + uint64_t q_size, + scalar_t* r_out /*OUT*/, + uint64_t r_size) { return ScalarPolyDivDispatcher::execute( - numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, config, - q_out, r_out); + numerator, numerator_size, denominator, denominator_size, config, q_out, q_size, r_out, r_size); } template <> eIcicleError polynomial_division( const scalar_t* numerator, - int64_t numerator_deg, uint64_t numerator_size, - const scalar_t* denumerator, - 
int64_t denumerator_deg, - uint64_t denumerator_size, - uint64_t q_size, - uint64_t r_size, - const VecOpsConfig& config, - scalar_t* q_out /*OUT*/, - scalar_t* r_out /*OUT*/) - { - return CONCAT_EXPAND(FIELD, poly_division)( - numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, config, - q_out, r_out); - } - - // Deprecated API - template <> - eIcicleError polynomial_division( - const scalar_t* numerator, - int64_t numerator_deg, - const scalar_t* denumerator, - int64_t denumerator_deg, + const scalar_t* denominator, + uint64_t denominator_size, const VecOpsConfig& config, scalar_t* q_out /*OUT*/, uint64_t q_size, scalar_t* r_out /*OUT*/, uint64_t r_size) { - ICICLE_LOG_WARNING - << "polynomial_division api is deprecated and replace with new api. Use new polynomial_division api instead"; - if (config.batch_size != 1) { - ICICLE_LOG_ERROR << "deprecated polynomial_division API does not support batch"; - return eIcicleError::INVALID_ARGUMENT; - } - return polynomial_division( - numerator, numerator_deg, numerator_deg + 1, denumerator, denumerator_deg, denumerator_deg + 1, q_size, r_size, - config, q_out, r_out); + return CONCAT_EXPAND(FIELD, poly_division)( + numerator, numerator_size, denominator, denominator_size, config, q_out, q_size, r_out, r_size); } } // namespace icicle \ No newline at end of file diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 62c3a75bf..6ae5a414c 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -813,20 +813,27 @@ TYPED_TEST(FieldApiTest, polynomialEval) TYPED_TEST(FieldApiTest, polynomialDivision) { const uint64_t numerator_size = 1 << 4; - const uint64_t denumerator_size = 1 << 3; - const int64_t max_num_deg = numerator_size - 1; - const int64_t max_denum_deg = denumerator_size - 1; - const uint64_t q_size = max_num_deg - max_denum_deg + 1; - const uint64_t r_size = max_num_deg + 1; + const uint64_t 
denominator_size = 1 << 3; + const uint64_t q_size = numerator_size - denominator_size + 1; + const uint64_t r_size = numerator_size; const int batch_size = 10 + rand() % 10; // basically we compute q(x),r(x) for a(x)=q(x)b(x)+r(x) by dividing a(x)/b(x) // randomize matrix with rows/cols as polynomials auto numerator = std::make_unique(numerator_size * batch_size); - auto denumerator = std::make_unique(denumerator_size * batch_size); + auto denominator = std::make_unique(denominator_size * batch_size); TypeParam::rand_host_many(numerator.get(), numerator_size * batch_size); - TypeParam::rand_host_many(denumerator.get(), denumerator_size * batch_size); + TypeParam::rand_host_many(denominator.get(), denominator_size * batch_size); + + // Add padding to each row so that the degree is lower than the size + const int zero_pad_length = 5; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < zero_pad_length; ++j) { + numerator[i * numerator_size + numerator_size - zero_pad_length + j] = TypeParam::zero(); + denominator[i * denominator_size + denominator_size - zero_pad_length + j] = TypeParam::zero(); + } + } for (auto device : s_registered_devices) { ICICLE_CHECK(icicle_set_device(device)); @@ -837,24 +844,30 @@ TYPED_TEST(FieldApiTest, polynomialDivision) auto r = std::make_unique(r_size * batch_size); auto config = default_vec_ops_config(); - config.batch_size = batch_size; + config.batch_size = columns_batch ? 
batch_size - zero_pad_length : batch_size; // skip the zero cols config.columns_batch = columns_batch; + // TODO v3.2 support column batch for this API + if (columns_batch) { + ICICLE_LOG_INFO << "Skipping polynomial division column batch"; + continue; + } + ICICLE_CHECK(polynomial_division( - numerator.get(), max_num_deg, numerator_size, denumerator.get(), max_denum_deg, denumerator_size, q_size, - r_size, config, q.get(), r.get())); + numerator.get(), numerator_size, denominator.get(), denominator_size, config, q.get(), q_size, r.get(), + r_size)); // test a(x)=q(x)b(x)+r(x) in random point const auto rand_x = TypeParam::rand_host(); - auto ax = std::make_unique(batch_size); - auto bx = std::make_unique(batch_size); - auto qx = std::make_unique(batch_size); - auto rx = std::make_unique(batch_size); + auto ax = std::make_unique(config.batch_size); + auto bx = std::make_unique(config.batch_size); + auto qx = std::make_unique(config.batch_size); + auto rx = std::make_unique(config.batch_size); polynomial_eval(numerator.get(), numerator_size, &rand_x, 1, config, ax.get()); - polynomial_eval(denumerator.get(), denumerator_size, &rand_x, 1, config, bx.get()); + polynomial_eval(denominator.get(), denominator_size, &rand_x, 1, config, bx.get()); polynomial_eval(q.get(), q_size, &rand_x, 1, config, qx.get()); polynomial_eval(r.get(), r_size, &rand_x, 1, config, rx.get()); - for (int i = 0; i < batch_size; ++i) { + for (int i = 0; i < config.batch_size; ++i) { // ICICLE_LOG_DEBUG << "ax=" << ax[i] << ", bx=" << bx[i] << ", qx=" << qx[i] << ", rx=" << rx[i]; ASSERT_EQ(ax[i], qx[i] * bx[i] + rx[i]); } From b361b0fd738bc78d0da53c05752284bdc5a4fcf3 Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 18:07:51 +0200 Subject: [PATCH 34/43] vector div for extension field and test fix for missing ext field apis --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 1 + .../include/icicle/backend/vec_ops_backend.h | 13 +- icicle/src/vec_ops.cpp | 27 ++- 
icicle/tests/test_field_api.cpp | 166 ++++++++---------- 4 files changed, 113 insertions(+), 94 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index b22c1ade2..913793ef5 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -496,6 +496,7 @@ REGISTER_VECTOR_ADD_EXT_FIELD_BACKEND("CPU", cpu_vector_add); REGISTER_VECTOR_ACCUMULATE_EXT_FIELD_BACKEND("CPU", cpu_vector_accumulate); REGISTER_VECTOR_SUB_EXT_FIELD_BACKEND("CPU", cpu_vector_sub); REGISTER_VECTOR_MUL_EXT_FIELD_BACKEND("CPU", cpu_vector_mul); +REGISTER_VECTOR_DIV_EXT_FIELD_BACKEND("CPU", cpu_vector_div); REGISTER_CONVERT_MONTGOMERY_EXT_FIELD_BACKEND("CPU", cpu_convert_montgomery); #endif // EXT_FIELD diff --git a/icicle/include/icicle/backend/vec_ops_backend.h b/icicle/include/icicle/backend/vec_ops_backend.h index 69b64c893..36b41760e 100644 --- a/icicle/include/icicle/backend/vec_ops_backend.h +++ b/icicle/include/icicle/backend/vec_ops_backend.h @@ -302,6 +302,16 @@ namespace icicle { }(); \ } + void register_extension_vector_div(const std::string& deviceType, extFieldVectorOpImpl impl); + + #define REGISTER_VECTOR_DIV_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_div_ext_field) = []() -> bool { \ + register_extension_vector_div(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + using extFieldConvertMontgomeryImpl = std::function; diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index ebb86e0c1..5c56facf8 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -180,6 +180,23 @@ namespace icicle { return CONCAT_EXPAND(FIELD, vector_div)(vec_a, vec_b, size, &config, output); } +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(VectorDivExtFieldDispatcher, extension_vector_div, extFieldVectorOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_div)( + const extension_t* vec_a, const extension_t* vec_b, 
uint64_t size, const VecOpsConfig* config, extension_t* output) + { + return VectorDivExtFieldDispatcher::execute(vec_a, vec_b, size, *config, output); + } + + template <> + eIcicleError vector_div( + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config, extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_vector_div)(vec_a, vec_b, size, &config, output); + } +#endif // EXT_FIELD + /*********************************** (Scalar + Vector) ELEMENT WISE ***********************************/ ICICLE_DISPATCHER_INST(ScalarAddDispatcher, scalar_add_vec, scalarVectorOpImpl); @@ -349,11 +366,12 @@ namespace icicle { const extension_t* input, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig* config, extension_t* output) { - return ExtFieldSliceDispatcher::execute(input, offset, stride, size, *config, output); + return ExtFieldSliceDispatcher::execute(input, offset, stride, size_in, size_out, *config, output); } template <> @@ -361,11 +379,12 @@ namespace icicle { const extension_t* input, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_slice)(input, offset, stride, size, &config, output); + return CONCAT_EXPAND(FIELD, extension_slice)(input, offset, stride, size_in, size_out, &config, output); } #endif // EXT_FIELD diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 6ae5a414c..67f7107d4 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -33,8 +33,7 @@ static inline std::string s_reference_target; static inline std::vector s_registered_devices; bool s_is_cuda_registered; // TODO Yuval remove this -template -class FieldApiTest : public ::testing::Test +class FieldApiTestBase : public ::testing::Test { public: // SetUpTestSuite/TearDownTestSuite are called once for the 
entire test suite @@ -60,7 +59,12 @@ class FieldApiTest : public ::testing::Test // SetUp/TearDown are called before and after each test void SetUp() override {} void TearDown() override {} +}; +template +class FieldApiTest : public FieldApiTestBase +{ +public: void random_samples(T* arr, uint64_t count) { for (uint64_t i = 0; i < count; i++) @@ -183,9 +187,9 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_mul, "vector mul", ITERS); ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); - // // div - FieldApiTest::random_samples(in_a.get(), total_size); - FieldApiTest::random_samples(in_b.get(), total_size); + // div + TypeParam::rand_host_many(in_a.get(), total_size); + TypeParam::rand_host_many(in_b.get(), total_size); // reference if (!s_is_cuda_registered) { for (int i = 0; i < total_size; i++) { @@ -253,7 +257,7 @@ TYPED_TEST(FieldApiTest, montgomeryConversion) ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); } -TYPED_TEST(FieldApiTest, VectorReduceOps) +TEST_F(FieldApiTestBase, VectorReduceOps) { int seed = time(0); srand(seed); @@ -267,17 +271,17 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) ICICLE_LOG_DEBUG << "batch_size = " << batch_size; ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; - auto in_a = std::make_unique(total_size); - auto out_main = std::make_unique(batch_size); - auto out_ref = std::make_unique(batch_size); + auto in_a = std::make_unique(total_size); + auto out_main = std::make_unique(batch_size); + auto out_ref = std::make_unique(batch_size); auto vector_accumulate_wrapper = - [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { + [](scalar_t* a, const scalar_t* b, uint64_t size, const VecOpsConfig& config, scalar_t* /*out*/) { return vector_accumulate(a, b, size, config); }; auto run = - [&](const std::string& dev_type, TypeParam* out, bool measure, auto 
vec_op_func, const char* msg, int iters) { + [&](const std::string& dev_type, scalar_t* out, bool measure, auto vec_op_func, const char* msg, int iters) { Device dev = {dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); @@ -295,10 +299,10 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) }; // sum - FieldApiTest::random_samples(in_a.get(), total_size); + scalar_t::rand_host_many(in_a.get(), total_size); // reference for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { - out_ref[idx_in_batch] = TypeParam::from(0); + out_ref[idx_in_batch] = scalar_t::from(0); } if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { @@ -308,16 +312,16 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) } } } else { - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); } - run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(TypeParam))); + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(scalar_t))); // product - FieldApiTest::random_samples(in_a.get(), total_size); + scalar_t::rand_host_many(in_a.get(), total_size); if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { - out_ref[idx_in_batch] = TypeParam::from(1); + out_ref[idx_in_batch] = scalar_t::from(1); } for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { @@ -326,13 +330,13 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) } } } else { - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); + 
run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); } - run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(TypeParam))); + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(scalar_t))); } -TYPED_TEST(FieldApiTest, scalarVectorOps) +TEST_F(FieldApiTestBase, scalarVectorOps) { int seed = time(0); srand(seed); @@ -346,21 +350,21 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; const int total_size = N * batch_size; - auto scalar_a = std::make_unique(batch_size); - auto in_b = std::make_unique(total_size); - auto out_main = std::make_unique(total_size); - auto out_ref = std::make_unique(total_size); + auto scalar_a = std::make_unique(batch_size); + auto in_b = std::make_unique(total_size); + auto out_main = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); ICICLE_LOG_DEBUG << "N = " << N; ICICLE_LOG_DEBUG << "batch_size = " << batch_size; ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; auto vector_accumulate_wrapper = - [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { + [](scalar_t* a, const scalar_t* b, uint64_t size, const VecOpsConfig& config, scalar_t* /*out*/) { return vector_accumulate(a, b, size, config); }; auto run = - [&](const std::string& dev_type, TypeParam* out, bool measure, auto vec_op_func, const char* msg, int iters) { + [&](const std::string& dev_type, scalar_t* out, bool measure, auto vec_op_func, const char* msg, int iters) { Device dev = {dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); @@ -378,8 +382,8 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) }; // scalar add vec - 
FieldApiTest::random_samples(scalar_a.get(), batch_size); - FieldApiTest::random_samples(in_b.get(), total_size); + scalar_t::rand_host_many(scalar_a.get(), batch_size); + scalar_t::rand_host_many(in_b.get(), total_size); // reference if (!s_is_cuda_registered) { @@ -390,15 +394,15 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) } } } else { - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); } - run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); // scalar sub vec - FieldApiTest::random_samples(scalar_a.get(), batch_size); - FieldApiTest::random_samples(in_b.get(), total_size); + scalar_t::rand_host_many(scalar_a.get(), batch_size); + scalar_t::rand_host_many(in_b.get(), total_size); if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { @@ -408,15 +412,15 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) } } } else { - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); } - run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); // scalar mul vec - 
FieldApiTest::random_samples(scalar_a.get(), batch_size); - FieldApiTest::random_samples(in_b.get(), total_size); + scalar_t::rand_host_many(scalar_a.get(), batch_size); + scalar_t::rand_host_many(in_b.get(), total_size); if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { @@ -426,10 +430,10 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) } } } else { - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); } - run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); } TYPED_TEST(FieldApiTest, matrixAPIsAsync) @@ -519,7 +523,7 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) // } // Option 3: Initialize the entire input array with random values - FieldApiTest::random_samples(h_inout.get(), total_size); + TypeParam::rand_host_many(h_inout.get(), total_size); // Reference implementation if (!s_is_cuda_registered) { @@ -666,6 +670,8 @@ TYPED_TEST(FieldApiTest, Slice) auto out_main = std::make_unique(total_size_out); auto out_ref = std::make_unique(total_size_out); + TypeParam::rand_host_many(in_a.get(), total_size_in); + auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { Device dev = {dev_type, 0}; icicle_set_device(dev); @@ -683,25 +689,6 @@ TYPED_TEST(FieldApiTest, Slice) END_TIMER(SLICE, oss.str().c_str(), measure); }; - // // Option 1: Initialize each input vector in the batch with the same ascending values - // for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { - // for (uint32_t 
i = 0; i < size_in; i++) { - // if(columns_batch){ - // in_a[idx_in_batch + batch_size * i] = TypeParam::from(i); - // } else { - // in_a[idx_in_batch * size_in + i] = TypeParam::from(i); - // } - // } - // } - - // // Option 2: Initialize the entire input array with ascending values - // for (int i = 0; i < total_size_in; i++) { - // in_a[i] = TypeParam::from(i); - // } - - // Option 3: Initialize the entire input array with random values - FieldApiTest::random_samples(in_a.get(), total_size_in); - // Reference implementation if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { @@ -721,7 +708,7 @@ TYPED_TEST(FieldApiTest, Slice) ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size_out * sizeof(TypeParam))); } -TYPED_TEST(FieldApiTest, highestNonZeroIdx) +TEST_F(FieldApiTestBase, highestNonZeroIdx) { int seed = time(0); srand(seed); @@ -731,7 +718,7 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) const bool columns_batch = rand() % 2; const int total_size = N * batch_size; - auto in_a = std::make_unique(total_size); + auto in_a = std::make_unique(total_size); for (int i = 0; i < batch_size; ++i) { // randomize different rows with zeros in the end auto size = std::max(int64_t(N) / 4 - i, int64_t(1)); @@ -762,7 +749,7 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(int64_t))); } -TYPED_TEST(FieldApiTest, polynomialEval) +TEST_F(FieldApiTestBase, polynomialEval) { int seed = time(0); srand(seed); @@ -780,12 +767,12 @@ TYPED_TEST(FieldApiTest, polynomialEval) const int total_coeffs_size = coeffs_size * batch_size; const int total_result_size = domain_size * batch_size; - auto in_coeffs = std::make_unique(total_coeffs_size); - auto in_domain = std::make_unique(domain_size); - auto out_main = std::make_unique(total_result_size); - auto out_ref = std::make_unique(total_result_size); + auto in_coeffs = std::make_unique(total_coeffs_size); + 
auto in_domain = std::make_unique(domain_size); + auto out_main = std::make_unique(total_result_size); + auto out_ref = std::make_unique(total_result_size); - auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { + auto run = [&](const std::string& dev_type, scalar_t* out, bool measure, const char* msg, int iters) { Device dev = {dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); @@ -802,15 +789,15 @@ TYPED_TEST(FieldApiTest, polynomialEval) END_TIMER(polynomialEval, oss.str().c_str(), measure); }; - FieldApiTest::random_samples(in_coeffs.get(), total_coeffs_size); - FieldApiTest::random_samples(in_domain.get(), domain_size); + scalar_t::rand_host_many(in_coeffs.get(), total_coeffs_size); + scalar_t::rand_host_many(in_domain.get(), domain_size); run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_result_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_result_size * sizeof(scalar_t))); } -TYPED_TEST(FieldApiTest, polynomialDivision) +TEST_F(FieldApiTestBase, polynomialDivision) { const uint64_t numerator_size = 1 << 4; const uint64_t denominator_size = 1 << 3; @@ -821,17 +808,17 @@ TYPED_TEST(FieldApiTest, polynomialDivision) // basically we compute q(x),r(x) for a(x)=q(x)b(x)+r(x) by dividing a(x)/b(x) // randomize matrix with rows/cols as polynomials - auto numerator = std::make_unique(numerator_size * batch_size); - auto denominator = std::make_unique(denominator_size * batch_size); - TypeParam::rand_host_many(numerator.get(), numerator_size * batch_size); - TypeParam::rand_host_many(denominator.get(), denominator_size * batch_size); + auto numerator = std::make_unique(numerator_size * batch_size); + auto denominator = std::make_unique(denominator_size * batch_size); + 
scalar_t::rand_host_many(numerator.get(), numerator_size * batch_size); + scalar_t::rand_host_many(denominator.get(), denominator_size * batch_size); // Add padding to each row so that the degree is lower than the size const int zero_pad_length = 5; for (int i = 0; i < batch_size; ++i) { for (int j = 0; j < zero_pad_length; ++j) { - numerator[i * numerator_size + numerator_size - zero_pad_length + j] = TypeParam::zero(); - denominator[i * denominator_size + denominator_size - zero_pad_length + j] = TypeParam::zero(); + numerator[i * numerator_size + numerator_size - zero_pad_length + j] = scalar_t::zero(); + denominator[i * denominator_size + denominator_size - zero_pad_length + j] = scalar_t::zero(); } } @@ -840,8 +827,8 @@ TYPED_TEST(FieldApiTest, polynomialDivision) for (int columns_batch = 0; columns_batch <= 1; columns_batch++) { ICICLE_LOG_DEBUG << "testing polynomial division on device " << device << " [column_batch=" << columns_batch << "]"; - auto q = std::make_unique(q_size * batch_size); - auto r = std::make_unique(r_size * batch_size); + auto q = std::make_unique(q_size * batch_size); + auto r = std::make_unique(r_size * batch_size); auto config = default_vec_ops_config(); config.batch_size = columns_batch ? 
batch_size - zero_pad_length : batch_size; // skip the zero cols @@ -857,11 +844,11 @@ TYPED_TEST(FieldApiTest, polynomialDivision) r_size)); // test a(x)=q(x)b(x)+r(x) in random point - const auto rand_x = TypeParam::rand_host(); - auto ax = std::make_unique(config.batch_size); - auto bx = std::make_unique(config.batch_size); - auto qx = std::make_unique(config.batch_size); - auto rx = std::make_unique(config.batch_size); + const auto rand_x = scalar_t::rand_host(); + auto ax = std::make_unique(config.batch_size); + auto bx = std::make_unique(config.batch_size); + auto qx = std::make_unique(config.batch_size); + auto rx = std::make_unique(config.batch_size); polynomial_eval(numerator.get(), numerator_size, &rand_x, 1, config, ax.get()); polynomial_eval(denominator.get(), denominator_size, &rand_x, 1, config, bx.get()); polynomial_eval(q.get(), q_size, &rand_x, 1, config, qx.get()); @@ -916,7 +903,8 @@ TYPED_TEST(FieldApiTest, ntt) const int total_size = N * batch_size; auto scalars = std::make_unique(total_size); - FieldApiTest::random_samples(scalars.get(), total_size); + TypeParam::rand_host_many(scalars.get(), total_size); + auto out_main = std::make_unique(total_size); auto out_ref = std::make_unique(total_size); auto run = [&](const std::string& dev_type, TypeParam* out, const char* msg, bool measure, int iters) { From fd208f4af7f3601de7c756bf55ee3e53dbd5849d Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 18:08:08 +0200 Subject: [PATCH 35/43] remove wrong file --- icicle_v3/include/icicle/mmcs.h | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 icicle_v3/include/icicle/mmcs.h diff --git a/icicle_v3/include/icicle/mmcs.h b/icicle_v3/include/icicle/mmcs.h deleted file mode 100644 index 94394b822..000000000 --- a/icicle_v3/include/icicle/mmcs.h +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once - -#include "errors.h" -#include "runtime.h" -#include "hash.h" -#include "merkle_tree.h" -#include 
"icicle/utils/utils.h" - -#include -#include - - -template - struct Matrix { - T* values; - size_t width; - size_t height; - }; - -eIcicleError build_mmcs_tree(const Matrix* inputs, - const unsigned int number_of_inputs, - limb_t** outputs, - const Hash& hash, - const Hash& compression, - const MerkleTreeConfig& config); - - //create hash <-hasher,compressor - - //sort, and call merkle tree - //how to return outputs? \ No newline at end of file From 4de758f003bc0854e1c45c49eae03a71c8b1e0d3 Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 18:10:24 +0200 Subject: [PATCH 36/43] revert api headers --- icicle/include/icicle/api/bls12_377.h | 39 --------------------------- icicle/include/icicle/api/bls12_381.h | 39 --------------------------- icicle/include/icicle/api/grumpkin.h | 10 ------- 3 files changed, 88 deletions(-) diff --git a/icicle/include/icicle/api/bls12_377.h b/icicle/include/icicle/api/bls12_377.h index 9cd0e9d66..972bd59e2 100644 --- a/icicle/include/icicle/api/bls12_377.h +++ b/icicle/include/icicle/api/bls12_377.h @@ -10,45 +10,6 @@ #include "icicle/msm.h" #include "icicle/vec_ops.h" -extern "C" bool bls12_377_g2_eq(bls12_377::g2_projective_t* point1, bls12_377::g2_projective_t* point2); - -extern "C" void bls12_377_g2_to_affine(bls12_377::g2_projective_t* point, bls12_377::g2_affine_t* point_out); - -extern "C" void bls12_377_g2_generate_projective_points(bls12_377::g2_projective_t* points, int size); - -extern "C" void bls12_377_g2_generate_affine_points(bls12_377::g2_affine_t* points, int size); - -extern "C" eIcicleError bls12_377_g2_affine_convert_montgomery( - const bls12_377::g2_affine_t* input, - size_t n, - bool is_into, - const VecOpsConfig* config, - bls12_377::g2_affine_t* output); - -extern "C" eIcicleError bls12_377_g2_projective_convert_montgomery( - const bls12_377::g2_projective_t* input, - size_t n, - bool is_into, - const VecOpsConfig* config, - bls12_377::g2_projective_t* output); - -extern "C" eIcicleError 
bls12_377_ecntt( - const bls12_377::projective_t* input, - int size, - NTTDir dir, - const NTTConfig* config, - bls12_377::projective_t* output); - -extern "C" eIcicleError bls12_377_precompute_msm_bases( - const bls12_377::affine_t* bases, int nof_bases, const MSMConfig* config, bls12_377::affine_t* output_bases); - -extern "C" eIcicleError bls12_377_msm( - const bls12_377::scalar_t* scalars, - const bls12_377::affine_t* points, - int msm_size, - const MSMConfig* config, - bls12_377::projective_t* out); - extern "C" bool bls12_377_eq(bls12_377::projective_t* point1, bls12_377::projective_t* point2); extern "C" void bls12_377_to_affine(bls12_377::projective_t* point, bls12_377::affine_t* point_out); diff --git a/icicle/include/icicle/api/bls12_381.h b/icicle/include/icicle/api/bls12_381.h index 01165b2d6..03e3bdd36 100644 --- a/icicle/include/icicle/api/bls12_381.h +++ b/icicle/include/icicle/api/bls12_381.h @@ -10,45 +10,6 @@ #include "icicle/msm.h" #include "icicle/vec_ops.h" -extern "C" bool bls12_381_g2_eq(bls12_381::g2_projective_t* point1, bls12_381::g2_projective_t* point2); - -extern "C" void bls12_381_g2_to_affine(bls12_381::g2_projective_t* point, bls12_381::g2_affine_t* point_out); - -extern "C" void bls12_381_g2_generate_projective_points(bls12_381::g2_projective_t* points, int size); - -extern "C" void bls12_381_g2_generate_affine_points(bls12_381::g2_affine_t* points, int size); - -extern "C" eIcicleError bls12_381_g2_affine_convert_montgomery( - const bls12_381::g2_affine_t* input, - size_t n, - bool is_into, - const VecOpsConfig* config, - bls12_381::g2_affine_t* output); - -extern "C" eIcicleError bls12_381_g2_projective_convert_montgomery( - const bls12_381::g2_projective_t* input, - size_t n, - bool is_into, - const VecOpsConfig* config, - bls12_381::g2_projective_t* output); - -extern "C" eIcicleError bls12_381_ecntt( - const bls12_381::projective_t* input, - int size, - NTTDir dir, - const NTTConfig* config, - bls12_381::projective_t* output); 
- -extern "C" eIcicleError bls12_381_precompute_msm_bases( - const bls12_381::affine_t* bases, int nof_bases, const MSMConfig* config, bls12_381::affine_t* output_bases); - -extern "C" eIcicleError bls12_381_msm( - const bls12_381::scalar_t* scalars, - const bls12_381::affine_t* points, - int msm_size, - const MSMConfig* config, - bls12_381::projective_t* out); - extern "C" bool bls12_381_eq(bls12_381::projective_t* point1, bls12_381::projective_t* point2); extern "C" void bls12_381_to_affine(bls12_381::projective_t* point, bls12_381::affine_t* point_out); diff --git a/icicle/include/icicle/api/grumpkin.h b/icicle/include/icicle/api/grumpkin.h index 3acdfa5c1..235b72843 100644 --- a/icicle/include/icicle/api/grumpkin.h +++ b/icicle/include/icicle/api/grumpkin.h @@ -9,16 +9,6 @@ #include "icicle/msm.h" #include "icicle/vec_ops.h" -extern "C" eIcicleError grumpkin_precompute_msm_bases( - const grumpkin::affine_t* bases, int nof_bases, const MSMConfig* config, grumpkin::affine_t* output_bases); - -extern "C" eIcicleError grumpkin_msm( - const grumpkin::scalar_t* scalars, - const grumpkin::affine_t* points, - int msm_size, - const MSMConfig* config, - grumpkin::projective_t* out); - extern "C" bool grumpkin_eq(grumpkin::projective_t* point1, grumpkin::projective_t* point2); extern "C" void grumpkin_to_affine(grumpkin::projective_t* point, grumpkin::affine_t* point_out); From c9788e9fcc50384dd9b7547f6aede10fe6554c21 Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 18:16:36 +0200 Subject: [PATCH 37/43] minor cleanup --- .../icicle/polynomials/default_backend/default_poly_backend.h | 2 +- icicle/tests/test_field_api.cpp | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h index bfa57f9c3..ef59f816f 100644 --- a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h +++ 
b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h @@ -65,7 +65,7 @@ namespace icicle { config.is_async = true; config.stream = m_stream; - ICICLE_CHECK(icicle::slice(in_coeffs, offset, stride, size, out_size, config, out_coeffs)); + ICICLE_CHECK(icicle::slice(in_coeffs, offset, stride, in_size, out_size, config, out_coeffs)); } void add_sub(PolyContext& res, PolyContext a, PolyContext b, bool add1_sub0) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 67f7107d4..703018797 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -2,9 +2,7 @@ #include #include #include "dlfcn.h" -#include #include -#include // For system #include "icicle/runtime.h" #include "icicle/vec_ops.h" From fdc7a5c428db9cff13b3750e1ad5ebc8937ae599 Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 19:14:30 +0200 Subject: [PATCH 38/43] update go vec-ops config struct --- wrappers/golang/core/vec_ops.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/wrappers/golang/core/vec_ops.go b/wrappers/golang/core/vec_ops.go index 08b87ef08..3671f0653 100644 --- a/wrappers/golang/core/vec_ops.go +++ b/wrappers/golang/core/vec_ops.go @@ -29,7 +29,15 @@ type VecOpsConfig struct { /// non-blocking and you'll need to synchronize it explicitly by calling /// `SynchronizeStream`. If set to false, the function will block the current CPU thread. IsAsync bool - Ext config_extension.ConfigExtensionHandler + /// Number of vectors (or operations) to process in a batch. + /// Each vector operation will be performed independently on each batch element. + /// Default value: 1. + BatchSize int32 + /// True if the batched vectors are stored as columns in a 2D array (i.e., the vectors are + /// strided in memory as columns of a matrix). If false, the batched vectors are stored + /// contiguously in memory (e.g., as rows or in a flat array). Default value: false. 
+ ColumnsBatch bool + Ext config_extension.ConfigExtensionHandler } /** @@ -43,6 +51,8 @@ func DefaultVecOpsConfig() VecOpsConfig { false, // isBOnDevice false, // isResultOnDevice false, // IsAsync + 1, // BatchSize + false, // ColumnsBatch nil, // Ext } From 198d196d873b073b81d4c344a1cb1399a518d759 Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 19:42:48 +0200 Subject: [PATCH 39/43] fix C++ example --- .../c++/polynomial-multiplication/example.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/examples/c++/polynomial-multiplication/example.cpp b/examples/c++/polynomial-multiplication/example.cpp index 9bd90b842..1fdfeb501 100644 --- a/examples/c++/polynomial-multiplication/example.cpp +++ b/examples/c++/polynomial-multiplication/example.cpp @@ -69,21 +69,18 @@ int main(int argc, char** argv) ICICLE_CHECK(bn254_ntt(polyB.get(), NTT_SIZE, NTTDir::kForward, &ntt_config, d_polyB)); // (4) multiply A,B - VecOpsConfig config{ - nullptr, - true, // is_a_on_device - true, // is_b_on_device - true, // is_result_on_device - false, // is_async - nullptr // ext - }; - ICICLE_CHECK(bn254_vector_mul(d_polyA, d_polyB, NTT_SIZE, &config, d_polyRes)); + VecOpsConfig config = default_vec_ops_config(); + config.is_a_on_device = true; + config.is_b_on_device = true; + config.is_result_on_device = true; + + ICICLE_CHECK(vector_mul(d_polyA, d_polyB, NTT_SIZE, config, d_polyRes)); // (5) INTT (in place) ntt_config.are_inputs_on_device = true; ntt_config.are_outputs_on_device = true; ntt_config.ordering = Ordering::kMN; - ICICLE_CHECK(bn254_ntt(d_polyRes, NTT_SIZE, NTTDir::kInverse, &ntt_config, d_polyRes)); + ICICLE_CHECK(ntt(d_polyRes, NTT_SIZE, NTTDir::kInverse, ntt_config, d_polyRes)); if (print) { END_TIMER(poly_multiply, "polynomial multiplication took"); } From 8f827d6727b056d469b34aab90dabd4de30070cf Mon Sep 17 00:00:00 2001 From: Emir Soyturk Date: Mon, 4 Nov 2024 21:30:23 +0300 Subject: [PATCH 40/43] vec_ops rust binding 
and tests (#642) --- wrappers/rust/icicle-core/src/vec_ops/mod.rs | 350 ++++++++++++++++++ .../rust/icicle-core/src/vec_ops/tests.rs | 234 +++++++++++- 2 files changed, 583 insertions(+), 1 deletion(-) diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index ba22b776d..277846ee8 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -13,6 +13,8 @@ pub struct VecOpsConfig { pub is_b_on_device: bool, pub is_result_on_device: bool, pub is_async: bool, + pub batch_size: i32, + pub columns_batch: bool, pub ext: ConfigExtension, } @@ -24,6 +26,8 @@ impl VecOpsConfig { is_b_on_device: false, is_result_on_device: false, is_async: false, + batch_size: 1, + columns_batch: false, ext: ConfigExtension::new(), } } @@ -58,6 +62,46 @@ pub trait VecOps { cfg: &VecOpsConfig, ) -> Result<(), eIcicleError>; + fn div( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn sum( + a: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn product( + a: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn scalar_add( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn scalar_sub( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn scalar_mul( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: 
&VecOpsConfig, + ) -> Result<(), eIcicleError>; + fn transpose( input: &(impl HostOrDeviceSlice + ?Sized), nof_rows: u32, @@ -76,6 +120,16 @@ pub trait VecOps { input: &mut (impl HostOrDeviceSlice + ?Sized), cfg: &VecOpsConfig, ) -> Result<(), eIcicleError>; + + fn slice( + input: &(impl HostOrDeviceSlice + ?Sized), + offset: u64, + stride: u64, + size_in: u64, + size_out: u64, + cfg: &VecOpsConfig, + output: &mut (impl HostOrDeviceSlice + ?Sized), + ) -> Result<(), eIcicleError>; } fn check_vec_ops_args<'a, F>( @@ -166,6 +220,88 @@ where <::Config as VecOps>::mul(a, b, result, &cfg) } +pub fn div_scalars( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(a, b, result, cfg); + <::Config as VecOps>::div(a, b, result, &cfg) +} + +pub fn sum_scalars( + a: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(a, a, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::sum(a, result, &cfg) +} + +pub fn product_scalars( + a: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(a, a, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::product(a, result, &cfg) +} + +pub fn scalar_add( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(b, b, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::scalar_add(a, b, result, &cfg) 
+} + +pub fn scalar_sub( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(b, b, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::scalar_sub(a, b, result, &cfg) +} + +pub fn scalar_mul( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(b, b, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::scalar_mul(a, b, result, &cfg) +} + pub fn transpose_matrix( input: &(impl HostOrDeviceSlice + ?Sized), nof_rows: u32, @@ -205,6 +341,23 @@ where <::Config as VecOps>::bit_reverse_inplace(input, &cfg) } +pub fn slice( + input: &(impl HostOrDeviceSlice + ?Sized), + offset: u64, + stride: u64, + size_in: u64, + size_out: u64, + cfg: &VecOpsConfig, + output: &mut (impl HostOrDeviceSlice + ?Sized), +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + <::Config as VecOps>::slice(input, offset, stride, size_in, size_out, &cfg, output) +} + + #[macro_export] macro_rules! impl_vec_ops_field { ( @@ -255,6 +408,59 @@ macro_rules! 
impl_vec_ops_field { result: *mut $field, ) -> eIcicleError; + #[link_name = concat!($field_prefix, "_vector_div")] + pub(crate) fn vector_div_ffi( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_vector_sum")] + pub(crate) fn vector_sum_ffi( + a: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + + #[link_name = concat!($field_prefix, "_vector_product")] + pub(crate) fn vector_product_ffi( + a: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_scalar_add_vec")] + pub(crate) fn scalar_add_ffi( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_scalar_sub_vec")] + pub(crate) fn scalar_sub_ffi( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_scalar_mul_vec")] + pub(crate) fn scalar_mul_ffi( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + #[link_name = concat!($field_prefix, "_matrix_transpose")] pub(crate) fn matrix_transpose_ffi( input: *const $field, @@ -271,6 +477,17 @@ macro_rules! impl_vec_ops_field { config: *const VecOpsConfig, output: *mut $field, ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_slice")] + pub(crate) fn slice_ffi( + input: *const $field, + offset: u64, + stride: u64, + size_in: u64, + size_out: u64, + cfg: *const VecOpsConfig, + output: *mut $field, + ) -> eIcicleError; } } @@ -344,6 +561,110 @@ macro_rules! 
impl_vec_ops_field { .wrap() } } + + fn div( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + b: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::vector_div_ffi( + a.as_ptr(), + b.as_ptr(), + a.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn sum( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::vector_sum_ffi( + a.as_ptr(), + a.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn product( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::vector_sum_ffi( + a.as_ptr(), + a.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn scalar_add( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + b: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::scalar_add_ffi( + a.as_ptr(), + b.as_ptr(), + b.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn scalar_sub( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + b: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::scalar_sub_ffi( + a.as_ptr(), + b.as_ptr(), + b.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn scalar_mul( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + b: &(impl 
HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::scalar_mul_ffi( + a.as_ptr(), + b.as_ptr(), + b.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } fn transpose( input: &(impl HostOrDeviceSlice<$field> + ?Sized), @@ -394,6 +715,29 @@ macro_rules! impl_vec_ops_field { .wrap() } } + + fn slice( + input: &(impl HostOrDeviceSlice<$field> + ?Sized), + offset: u64, + stride: u64, + size_in: u64, + size_out: u64, + cfg: &VecOpsConfig, + output: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::slice_ffi( + input.as_ptr(), + offset, + stride, + size_in, + size_out, + cfg as *const VecOpsConfig, + output.as_mut_ptr(), + ) + .wrap() + } + } } }; } @@ -436,6 +780,12 @@ macro_rules! impl_vec_ops_tests { initialize(); check_bit_reverse_inplace::<$field>() } + + #[test] + pub fn test_slice() { + initialize(); + check_slice::<$field>() + } } }; } diff --git a/wrappers/rust/icicle-core/src/vec_ops/tests.rs b/wrappers/rust/icicle-core/src/vec_ops/tests.rs index 6762f06c9..4a16fcb21 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/tests.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/tests.rs @@ -2,7 +2,7 @@ use crate::test_utilities; use crate::traits::GenerateRandom; use crate::vec_ops::{ - accumulate_scalars, add_scalars, bit_reverse, bit_reverse_inplace, mul_scalars, sub_scalars, transpose_matrix, + accumulate_scalars, add_scalars, bit_reverse, bit_reverse_inplace, mul_scalars, slice, div_scalars, sum_scalars, scalar_add, scalar_sub, scalar_mul, product_scalars, sub_scalars, transpose_matrix, FieldImpl, VecOps, VecOpsConfig, }; use icicle_runtime::device::Device; @@ -44,6 +44,12 @@ where check_vec_ops_scalars_add::(test_size); check_vec_ops_scalars_sub::(test_size); check_vec_ops_scalars_mul::(test_size); + 
check_vec_ops_scalars_div::(test_size); + check_vec_ops_scalars_sum::(test_size); + check_vec_ops_scalars_product::(test_size); + check_vec_ops_scalars_add_scalar::(test_size); + check_vec_ops_scalars_sub_scalar::(test_size); + check_vec_ops_scalars_mul_scalar::(test_size); check_vec_ops_scalars_accumulate::(test_size); } @@ -140,6 +146,191 @@ where .unwrap(); } +pub fn check_vec_ops_scalars_div(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(test_size); + let b = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let b = HostSlice::from_slice(&b); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + + test_utilities::test_set_main_device(); + div_scalars(a_main, b, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + div_scalars(a_main, b, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_sum(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + + test_utilities::test_set_main_device(); + sum_scalars(a_main, result_main, &cfg).unwrap(); + + 
test_utilities::test_set_ref_device(); + sum_scalars(a_main, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_product(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + + test_utilities::test_set_main_device(); + product_scalars(a_main, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + product_scalars(a_main, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_add_scalar(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(1); + let b = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let b = HostSlice::from_slice(&b); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + cfg.batch_size = 1; + + test_utilities::test_set_main_device(); + scalar_add(a_main, b, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + scalar_add(a_main, b, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + 
.destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_sub_scalar(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(1); + let b = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let b = HostSlice::from_slice(&b); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + cfg.batch_size = 1; + + test_utilities::test_set_main_device(); + scalar_sub(a_main, b, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + scalar_sub(a_main, b, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_mul_scalar(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(1); + let b = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let b = HostSlice::from_slice(&b); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + cfg.batch_size = 1; + + test_utilities::test_set_main_device(); + scalar_mul(a_main, b, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + scalar_mul(a_main, b, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + pub fn 
check_vec_ops_scalars_accumulate(test_size: usize) where ::Config: VecOps + GenerateRandom, @@ -205,6 +396,47 @@ where assert_eq!(result_main, result_ref); } +pub fn check_slice() +where + ::Config: VecOps + GenerateRandom, +{ + let size_in: u64 = 1 << 10; + let offset: u64 = 10; + let stride: u64 = 3; + let size_out: u64 = ((size_in - offset) / stride) - 1; + + let input_matrix = F::Config::generate_random(size_in as usize); + let mut result_main = vec![F::zero(); size_out as usize]; + let mut result_ref = vec![F::zero(); size_out as usize]; + + let cfg = VecOpsConfig::default(); + test_utilities::test_set_main_device(); + slice( + HostSlice::from_slice(&input_matrix), + offset, + stride, + size_in, + size_out, + &cfg, + HostSlice::from_mut_slice(&mut result_main), + ) + .unwrap(); + + test_utilities::test_set_ref_device(); + slice( + HostSlice::from_slice(&input_matrix), + offset, + stride, + size_in, + size_out, + &cfg, + HostSlice::from_mut_slice(&mut result_ref), + ) + .unwrap(); + + assert_eq!(result_main, result_ref); +} + pub fn check_bit_reverse() where ::Config: VecOps + GenerateRandom, From 0c25f75bb0d41b39dde11c8f218cc0b8c4816986 Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 20:33:32 +0200 Subject: [PATCH 41/43] formatting rust --- wrappers/rust/icicle-core/src/vec_ops/mod.rs | 4 +--- wrappers/rust/icicle-core/src/vec_ops/tests.rs | 5 +++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index 277846ee8..58e571d52 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -357,7 +357,6 @@ where <::Config as VecOps>::slice(input, offset, stride, size_in, size_out, &cfg, output) } - #[macro_export] macro_rules! impl_vec_ops_field { ( @@ -425,7 +424,6 @@ macro_rules! 
impl_vec_ops_field { result: *mut $field, ) -> eIcicleError; - #[link_name = concat!($field_prefix, "_vector_product")] pub(crate) fn vector_product_ffi( a: *const $field, @@ -561,7 +559,7 @@ macro_rules! impl_vec_ops_field { .wrap() } } - + fn div( a: &(impl HostOrDeviceSlice<$field> + ?Sized), b: &(impl HostOrDeviceSlice<$field> + ?Sized), diff --git a/wrappers/rust/icicle-core/src/vec_ops/tests.rs b/wrappers/rust/icicle-core/src/vec_ops/tests.rs index 4a16fcb21..0dbd4c9a3 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/tests.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/tests.rs @@ -2,8 +2,9 @@ use crate::test_utilities; use crate::traits::GenerateRandom; use crate::vec_ops::{ - accumulate_scalars, add_scalars, bit_reverse, bit_reverse_inplace, mul_scalars, slice, div_scalars, sum_scalars, scalar_add, scalar_sub, scalar_mul, product_scalars, sub_scalars, transpose_matrix, - FieldImpl, VecOps, VecOpsConfig, + accumulate_scalars, add_scalars, bit_reverse, bit_reverse_inplace, div_scalars, mul_scalars, product_scalars, + scalar_add, scalar_mul, scalar_sub, slice, sub_scalars, sum_scalars, transpose_matrix, FieldImpl, VecOps, + VecOpsConfig, }; use icicle_runtime::device::Device; use icicle_runtime::memory::{DeviceVec, HostSlice}; From dd6833b6760a18a8f1f49d16bedbfa7294c9e0dd Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 22:43:48 +0200 Subject: [PATCH 42/43] extension field vec ops --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 25 ++-- .../include/icicle/backend/vec_ops_backend.h | 77 ++++++++++-- icicle/src/vec_ops.cpp | 115 +++++++++++++++++- 3 files changed, 191 insertions(+), 26 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 913793ef5..22c257023 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -491,15 +491,6 @@ eIcicleError cpu_convert_montgomery( REGISTER_CONVERT_MONTGOMERY_BACKEND("CPU", 
cpu_convert_montgomery); -#ifdef EXT_FIELD -REGISTER_VECTOR_ADD_EXT_FIELD_BACKEND("CPU", cpu_vector_add); -REGISTER_VECTOR_ACCUMULATE_EXT_FIELD_BACKEND("CPU", cpu_vector_accumulate); -REGISTER_VECTOR_SUB_EXT_FIELD_BACKEND("CPU", cpu_vector_sub); -REGISTER_VECTOR_MUL_EXT_FIELD_BACKEND("CPU", cpu_vector_mul); -REGISTER_VECTOR_DIV_EXT_FIELD_BACKEND("CPU", cpu_vector_div); -REGISTER_CONVERT_MONTGOMERY_EXT_FIELD_BACKEND("CPU", cpu_convert_montgomery); -#endif // EXT_FIELD - /*********************************** SUM ***********************************/ template @@ -934,4 +925,18 @@ eIcicleError cpu_poly_divide( return eIcicleError::SUCCESS; } -REGISTER_POLYNOMIAL_DIVISION("CPU", cpu_poly_divide); \ No newline at end of file +REGISTER_POLYNOMIAL_DIVISION("CPU", cpu_poly_divide); + +#ifdef EXT_FIELD +REGISTER_VECTOR_ADD_EXT_FIELD_BACKEND("CPU", cpu_vector_add); +REGISTER_VECTOR_ACCUMULATE_EXT_FIELD_BACKEND("CPU", cpu_vector_accumulate); +REGISTER_VECTOR_SUB_EXT_FIELD_BACKEND("CPU", cpu_vector_sub); +REGISTER_VECTOR_MUL_EXT_FIELD_BACKEND("CPU", cpu_vector_mul); +REGISTER_VECTOR_DIV_EXT_FIELD_BACKEND("CPU", cpu_vector_div); +REGISTER_CONVERT_MONTGOMERY_EXT_FIELD_BACKEND("CPU", cpu_convert_montgomery); +REGISTER_VECTOR_SUM_EXT_FIELD_BACKEND("CPU", cpu_vector_sum); +REGISTER_VECTOR_PRODUCT_EXT_FIELD_BACKEND("CPU", cpu_vector_product); +REGISTER_SCALAR_MUL_VEC_EXT_FIELD_BACKEND("CPU", cpu_scalar_mul); +REGISTER_SCALAR_ADD_VEC_EXT_FIELD_BACKEND("CPU", cpu_scalar_add); +REGISTER_SCALAR_SUB_VEC_EXT_FIELD_BACKEND("CPU", cpu_scalar_sub); +#endif // EXT_FIELD \ No newline at end of file diff --git a/icicle/include/icicle/backend/vec_ops_backend.h b/icicle/include/icicle/backend/vec_ops_backend.h index 36b41760e..3739fb780 100644 --- a/icicle/include/icicle/backend/vec_ops_backend.h +++ b/icicle/include/icicle/backend/vec_ops_backend.h @@ -7,14 +7,6 @@ using namespace field_config; namespace icicle { /*************************** Backend registration ***************************/ - using 
vectorVectorOpImpl = std::function; - using vectorVectorOpImplInplaceA = std::function; @@ -82,7 +74,7 @@ namespace icicle { scalar_t* r_out /*OUT*/, uint64_t r_size)>; - void register_vector_add(const std::string& deviceType, vectorVectorOpImpl impl); + void register_vector_add(const std::string& deviceType, scalarVectorOpImpl impl); #define REGISTER_VECTOR_ADD_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ @@ -102,7 +94,7 @@ namespace icicle { }(); \ } - void register_vector_sub(const std::string& deviceType, vectorVectorOpImpl impl); + void register_vector_sub(const std::string& deviceType, scalarVectorOpImpl impl); #define REGISTER_VECTOR_SUB_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ static bool UNIQUE(_reg_vec_sub) = []() -> bool { \ @@ -111,7 +103,7 @@ namespace icicle { }(); \ } - void register_vector_mul(const std::string& deviceType, vectorVectorOpImpl impl); + void register_vector_mul(const std::string& deviceType, scalarVectorOpImpl impl); #define REGISTER_VECTOR_MUL_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ @@ -121,7 +113,7 @@ namespace icicle { }(); \ } - void register_vector_div(const std::string& deviceType, vectorVectorOpImpl impl); + void register_vector_div(const std::string& deviceType, scalarVectorOpImpl impl); #define REGISTER_VECTOR_DIV_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ @@ -263,6 +255,17 @@ namespace icicle { using extFieldVectorOpImplInplaceA = std::function; + using extFieldVectorReduceOpImpl = std::function; + + using extFieldVectorOpImpl = std::function; + void register_extension_vector_add(const std::string& deviceType, extFieldVectorOpImpl impl); #define REGISTER_VECTOR_ADD_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ @@ -312,6 +315,56 @@ namespace icicle { }(); \ } + void register_extension_scalar_mul_vec(const std::string& deviceType, extFieldVectorOpImpl impl); + + #define REGISTER_SCALAR_MUL_VEC_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_scalar_mul_vec_ext_field) = []() -> bool { \ + 
register_extension_scalar_mul_vec(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_extension_scalar_add_vec(const std::string& deviceType, extFieldVectorOpImpl impl); + + #define REGISTER_SCALAR_ADD_VEC_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_scalar_add_vec_ext_field) = []() -> bool { \ + register_extension_scalar_add_vec(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_extension_scalar_sub_vec(const std::string& deviceType, extFieldVectorOpImpl impl); + + #define REGISTER_SCALAR_SUB_VEC_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_scalar_sub_vec_ext_field) = []() -> bool { \ + register_extension_scalar_sub_vec(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_extension_vector_sum(const std::string& deviceType, extFieldVectorReduceOpImpl impl); + + #define REGISTER_VECTOR_SUM_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_sum_ext_field) = []() -> bool { \ + register_extension_vector_sum(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_extension_vector_product(const std::string& deviceType, extFieldVectorReduceOpImpl impl); + + #define REGISTER_VECTOR_PRODUCT_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_product_ext_field) = []() -> bool { \ + register_extension_vector_product(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + using extFieldConvertMontgomeryImpl = std::function + eIcicleError vector_product(const extension_t* vec_a, uint64_t size, const VecOpsConfig& config, extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_vector_product)(vec_a, size, &config, output); + } +#endif // EXT_FIELD + /*********************************** REDUCE SUM ****************************/ ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, VectorReduceOpImpl); @@ -33,8 +49,24 @@ namespace icicle { return CONCAT_EXPAND(FIELD, 
vector_sum)(vec_a, size, &config, output); } +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(VectorSumExtFieldDispatcher, extension_vector_sum, extFieldVectorReduceOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_sum)( + const extension_t* vec_a, uint64_t size, const VecOpsConfig* config, extension_t* output) + { + return VectorSumExtFieldDispatcher::execute(vec_a, size, *config, output); + } + + template <> + eIcicleError vector_sum(const extension_t* vec_a, uint64_t size, const VecOpsConfig& config, extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_vector_sum)(vec_a, size, &config, output); + } +#endif // EXT_FIELD + /*********************************** ADD ***********************************/ - ICICLE_DISPATCHER_INST(VectorAddDispatcher, vector_add, vectorVectorOpImpl); + ICICLE_DISPATCHER_INST(VectorAddDispatcher, vector_add, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_add)( const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) @@ -99,7 +131,7 @@ namespace icicle { #endif // EXT_FIELD /*********************************** SUB ***********************************/ - ICICLE_DISPATCHER_INST(VectorSubDispatcher, vector_sub, vectorVectorOpImpl); + ICICLE_DISPATCHER_INST(VectorSubDispatcher, vector_sub, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_sub)( const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) @@ -132,7 +164,7 @@ namespace icicle { #endif // EXT_FIELD /*********************************** MUL ***********************************/ - ICICLE_DISPATCHER_INST(VectorMulDispatcher, vector_mul, vectorVectorOpImpl); + ICICLE_DISPATCHER_INST(VectorMulDispatcher, vector_mul, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_mul)( const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) @@ -165,7 +197,7 
@@ namespace icicle { #endif // EXT_FIELD /*********************************** DIV ***********************************/ - ICICLE_DISPATCHER_INST(VectorDivDispatcher, vector_div, vectorVectorOpImpl); + ICICLE_DISPATCHER_INST(VectorDivDispatcher, vector_div, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_div)( const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) @@ -213,6 +245,31 @@ namespace icicle { return CONCAT_EXPAND(FIELD, scalar_add_vec)(scalar_a, vec_b, size, &config, output); } +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(ScalarAddExtFieldDispatcher, extension_scalar_add_vec, extFieldVectorOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_scalar_add_vec)( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig* config, + extension_t* output) + { + return ScalarAddExtFieldDispatcher::execute(scalar_a, vec_b, size, *config, output); + } + + template <> + eIcicleError scalar_add_vec( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig& config, + extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_scalar_add_vec)(scalar_a, vec_b, size, &config, output); + } +#endif // EXT_FIELD + /*********************************** (Scalar - Vector) ELEMENT WISE ***********************************/ ICICLE_DISPATCHER_INST(ScalarSubDispatcher, scalar_sub_vec, scalarVectorOpImpl); @@ -228,6 +285,31 @@ namespace icicle { { return CONCAT_EXPAND(FIELD, scalar_sub_vec)(scalar_a, vec_b, size, &config, output); } + +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(ScalarSubExtFieldDispatcher, extension_scalar_sub_vec, extFieldVectorOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_scalar_sub_vec)( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig* config, + extension_t* output) + { + return ScalarSubExtFieldDispatcher::execute(scalar_a, 
vec_b, size, *config, output); + } + + template <> + eIcicleError scalar_sub_vec( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig& config, + extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_scalar_sub_vec)(scalar_a, vec_b, size, &config, output); + } +#endif // EXT_FIELD /*********************************** MUL BY SCALAR ***********************************/ ICICLE_DISPATCHER_INST(ScalarMulDispatcher, scalar_mul_vec, scalarVectorOpImpl); @@ -244,6 +326,31 @@ namespace icicle { return CONCAT_EXPAND(FIELD, scalar_mul_vec)(scalar_a, vec_b, size, &config, output); } +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(ScalarMulExtFieldDispatcher, extension_scalar_mul_vec, extFieldVectorOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_scalar_mul_vec)( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig* config, + extension_t* output) + { + return ScalarMulExtFieldDispatcher::execute(scalar_a, vec_b, size, *config, output); + } + + template <> + eIcicleError scalar_mul_vec( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig& config, + extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_scalar_mul_vec)(scalar_a, vec_b, size, &config, output); + } +#endif // EXT_FIELD + /*********************************** CONVERT MONTGOMERY ***********************************/ ICICLE_DISPATCHER_INST(ScalarConvertMontgomeryDispatcher, scalar_convert_montgomery, scalarConvertMontgomeryImpl) From fbb9f5506e30677a45d8a6cb13bb91fe4059aac6 Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 22:44:06 +0200 Subject: [PATCH 43/43] release script build v3.1 --- scripts/release/build_all.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/release/build_all.sh b/scripts/release/build_all.sh index cbb4b8860..b8050fb70 100755 --- a/scripts/release/build_all.sh +++ 
b/scripts/release/build_all.sh @@ -32,25 +32,25 @@ docker run --rm --gpus all \ -v ./icicle:/icicle \ -v "$output_dir:/output" \ -v ./scripts:/scripts \ - icicle-release-ubuntu22-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubuntu22 cuda122 & + icicle-release-ubuntu22-cuda122 bash /scripts/release/build_release_and_tar.sh icicle_3_1_0 ubuntu22 cuda122 & # ubuntu 20 docker run --rm --gpus all \ -v ./icicle:/icicle \ -v "$output_dir:/output" \ -v ./scripts:/scripts \ - icicle-release-ubuntu20-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubuntu20 cuda122 & + icicle-release-ubuntu20-cuda122 bash /scripts/release/build_release_and_tar.sh icicle_3_1_0 ubuntu20 cuda122 & # ubi 8 (rhel compatible) docker run --rm --gpus all \ -v ./icicle:/icicle \ -v "$output_dir:/output" \ -v ./scripts:/scripts \ - icicle-release-ubi8-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubi8 cuda122 & + icicle-release-ubi8-cuda122 bash /scripts/release/build_release_and_tar.sh icicle_3_1_0 ubi8 cuda122 & # ubi 9 (rhel compatible) docker run --rm --gpus all \ -v ./icicle:/icicle \ -v "$output_dir:/output" \ -v ./scripts:/scripts \ - icicle-release-ubi9-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubi9 cuda122 & + icicle-release-ubi9-cuda122 bash /scripts/release/build_release_and_tar.sh icicle_3_1_0 ubi9 cuda122 &