From f651c59d106fb60708448b7a5fef1c3eebbf56fa Mon Sep 17 00:00:00 2001 From: Stas Polonsky Date: Tue, 10 Sep 2024 21:38:06 +0000 Subject: [PATCH 01/43] initial edits --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 6 ++--- icicle/include/icicle/vec_ops.h | 9 +++++++ icicle/src/vec_ops.cpp | 25 ++++++++++++++++++++ 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 3a2156d60..e3f7532aa 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -362,8 +362,7 @@ eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, co return eIcicleError::SUCCESS; } -// Once backend will support - uncomment the following line -// REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); +REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); /*********************************** SUM ***********************************/ template eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) @@ -387,8 +386,7 @@ eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n return eIcicleError::SUCCESS; } -// Once backend will support - uncomment the following line -// REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); +REGISTER_VECTOR_PRODUCT_BACKEND("CPU", cpu_vector_product); /*********************************** MUL BY SCALAR***********************************/ template diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index 735aaf65c..f29ccd335 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -46,6 +46,15 @@ namespace icicle { return config; } + // Reduction operations + + template + eIcicleError vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output); + + template + eIcicleError vector_sum(const Device& device, const T* 
vec_a, uint64_t n, const VecOpsConfig& config, T* output); + + // Element-wise vector operations /** diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index d42fa0dca..ad44767a5 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -3,6 +3,31 @@ namespace icicle { + + /*********************************** REDUCE PRODUCT ************************/ + ICICLE_DISPATCHER_INST(VectorProductDispatcher, vector_product, scalarVectorOpImpl /* @@@ confirm this argument */); + + // TODO: extern "C" for FFI + + template <> + eIcicleError + vector_product(const scalar_t* vec_a, uint64_t n, const VecOpsConfig& config, scalar_t* output) + { + return CONCAT_EXPAND(FIELD, vector_product)(vec_a, n, &config, output); + } + + /*********************************** REDUCE SUM ****************************/ + ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, scalarVectorOpImpl /* @@@ confirm this argument */); + + // TODO: extern "C" for FFI + + template <> + eIcicleError + vector_sum(const scalar_t* vec_a, uint64_t n, const VecOpsConfig& config, scalar_t* output) + { + return CONCAT_EXPAND(FIELD, vector_sum)(vec_a, n, &config, output); + } + /*********************************** ADD ***********************************/ ICICLE_DISPATCHER_INST(VectorAddDispatcher, vector_add, scalarVectorOpImpl); From 64d4414c0899def332fdce7775bb9da6b77f5312 Mon Sep 17 00:00:00 2001 From: Stas Polonsky Date: Fri, 13 Sep 2024 21:14:56 +0000 Subject: [PATCH 02/43] vector_sum issue --- .../c++/vector-api/.devcontainer/Dockerfile | 25 ++++ .../.devcontainer/devcontainer.json | 22 +++ examples/c++/vector-api/CMakeLists.txt | 16 +++ examples/c++/vector-api/README.md | 32 +++++ examples/c++/vector-api/example.cpp | 136 ++++++++++++++++++ examples/c++/vector-api/run.sh | 66 +++++++++ icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 44 +++++- icicle/include/icicle/api/babybear.h | 48 +++---- icicle/include/icicle/api/bls12_377.h | 42 +++--- icicle/include/icicle/api/bls12_381.h | 42 +++--- 
icicle/include/icicle/api/bn254.h | 42 +++--- icicle/include/icicle/api/bw6_761.h | 42 +++--- icicle/include/icicle/api/grumpkin.h | 24 ++-- icicle/include/icicle/api/stark252.h | 10 +- .../include/icicle/backend/vec_ops_backend.h | 31 ++++ icicle/include/icicle/vec_ops.h | 26 +++- icicle/src/vec_ops.cpp | 16 ++- 17 files changed, 531 insertions(+), 133 deletions(-) create mode 100644 examples/c++/vector-api/.devcontainer/Dockerfile create mode 100644 examples/c++/vector-api/.devcontainer/devcontainer.json create mode 100644 examples/c++/vector-api/CMakeLists.txt create mode 100644 examples/c++/vector-api/README.md create mode 100644 examples/c++/vector-api/example.cpp create mode 100755 examples/c++/vector-api/run.sh diff --git a/examples/c++/vector-api/.devcontainer/Dockerfile b/examples/c++/vector-api/.devcontainer/Dockerfile new file mode 100644 index 000000000..64188da96 --- /dev/null +++ b/examples/c++/vector-api/.devcontainer/Dockerfile @@ -0,0 +1,25 @@ +# Make sure NVIDIA Container Toolkit is installed on your host + +# Use the specified base image +FROM nvidia/cuda:12.0.0-devel-ubuntu22.04 + +# Update and install dependencies +RUN apt-get update && apt-get install -y \ + cmake \ + curl \ + build-essential \ + git \ + libboost-all-dev \ + && rm -rf /var/lib/apt/lists/* + +# Clone Icicle from a GitHub repository +RUN git clone https://github.com/ingonyama-zk/icicle.git /icicle + +# Set the working directory in the container +WORKDIR /icicle-example + +# Specify the default command for the container +CMD ["/bin/bash"] + + + diff --git a/examples/c++/vector-api/.devcontainer/devcontainer.json b/examples/c++/vector-api/.devcontainer/devcontainer.json new file mode 100644 index 000000000..490fe90a6 --- /dev/null +++ b/examples/c++/vector-api/.devcontainer/devcontainer.json @@ -0,0 +1,22 @@ +{ + "name": "Icicle Examples: polynomial multiplication", + "build": { + "dockerfile": "Dockerfile" + }, + "runArgs": [ + "--gpus", + "all" + ], + "postCreateCommand": [ + 
"nvidia-smi" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-vscode.cmake-tools", + "ms-python.python", + "ms-vscode.cpptools" + ] + } + } +} \ No newline at end of file diff --git a/examples/c++/vector-api/CMakeLists.txt b/examples/c++/vector-api/CMakeLists.txt new file mode 100644 index 000000000..c32f17f43 --- /dev/null +++ b/examples/c++/vector-api/CMakeLists.txt @@ -0,0 +1,16 @@ +cmake_minimum_required(VERSION 3.18) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED TRUE) + +project(example) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") +add_executable(example example.cpp) +target_include_directories(example PRIVATE "../../../icicle/include" "..") +target_link_directories(example PRIVATE "${CMAKE_SOURCE_DIR}/build/icicle") +message("${CMAKE_BINARY_DIR}/icicle") +target_link_libraries(example PRIVATE icicle_curve_bn254 icicle_field_bn254 icicle_device) +if(BACKEND_DIR) + add_compile_definitions(BACKEND_DIR="${BACKEND_DIR}") +endif() + diff --git a/examples/c++/vector-api/README.md b/examples/c++/vector-api/README.md new file mode 100644 index 000000000..46c556339 --- /dev/null +++ b/examples/c++/vector-api/README.md @@ -0,0 +1,32 @@ +# Icicle Example: Vector Operations API + +TBD + +## Key-Takeaway + +Icicle provides polynomial multiplication using the Number Theoretical Transform (NTT), including forward and inverse transforms. + +## Concise Usage Explanation + +1. Include the necessary headers. +2. Initialize the NTT domain. +3. Prepare and transform the polynomials from host to device memory. +4. Perform pointwise multiplication. +5. Apply the inverse NTT. + +## Running the example + +```sh +# for CPU +./run.sh -d CPU +# for CUDA +./run.sh -d CUDA -b /path/to/cuda/backend/install/dir +``` + +## What's in the example + +1. Define the size of the example. +2. Initialize input polynomials. +3. Perform Radix-2 or Mixed-Radix NTT. +4. Perform pointwise polynomial multiplication. +5. Apply the inverse NTT. 
diff --git a/examples/c++/vector-api/example.cpp b/examples/c++/vector-api/example.cpp new file mode 100644 index 000000000..ca653abe9 --- /dev/null +++ b/examples/c++/vector-api/example.cpp @@ -0,0 +1,136 @@ +#include +#include +#include + +#include "icicle/runtime.h" +#include "icicle/api/bn254.h" +#include "icicle/utils/log.h" + + +// SP: I undertstand this code is auto-generated, but I can't get scrip/gen to work. + +extern "C" eIcicleError bn254_vector_product( + const bn254::scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result); + +extern "C" eIcicleError bn254_vector_sum( + const bn254::scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result); + +// SP: end of my changes + +using namespace bn254; + +#include "examples_utils.h" + +void random_samples(scalar_t* res, uint32_t count) +{ + for (int i = 0; i < count; i++) + res[i] = i < 1000 ? scalar_t::rand_host() : res[i - 1000]; +} + +// void incremental_values(scalar_t* res, uint32_t count) +// { +// for (int i = 0; i < count; i++) { +// res[i] = i ? 
res[i - 1] + scalar_t::one() : scalar_t::zero(); +// } +// } + +int main(int argc, char** argv) +{ + try_load_and_set_backend_device(argc, argv); + + int N_LOG = 20; + int N = 1 << N_LOG; + + // on-host data + auto h_a = std::make_unique(N); + auto h_b = std::make_unique(N); + auto h_out = std::make_unique(N); + + random_samples(h_a.get(), N ); + random_samples(h_b.get(), N ); + + // on-device data + scalar_t *d_a, *d_b, *d_out; + + DeviceProperties device_props; + ICICLE_CHECK(icicle_get_device_properties(device_props)); + + ICICLE_CHECK(icicle_malloc((void**)&d_a, sizeof(scalar_t) * N)); + ICICLE_CHECK(icicle_malloc((void**)&d_b, sizeof(scalar_t) * N)); + ICICLE_CHECK(icicle_malloc((void**)&d_out, sizeof(scalar_t) * N)); + + ICICLE_CHECK(icicle_copy(d_a, h_a.get(), sizeof(scalar_t) * N)); + ICICLE_CHECK(icicle_copy(d_b, h_b.get(), sizeof(scalar_t) * N)); + + VecOpsConfig h_config{ + nullptr, + false, // is_a_on_device + false, // is_b_on_device + false, // is_result_on_device + false, // is_async + nullptr // ext + }; + + VecOpsConfig d_config{ + nullptr, + true, // is_a_on_device + true, // is_b_on_device + true, // is_result_on_device + false, // is_async + nullptr // ext + }; + + + // Reduction operations + + START_TIMER(baseline_reduce_sum); + h_out[0] = scalar_t::zero(); + for (uint64_t i = 0; i < N; ++i) { + h_out[0] = h_out[0] + h_a[i]; + } + END_TIMER(baseline_reduce_sum, "baseline reduce sum took"); + + ICICLE_LOG_INFO << "Failed to load "; + std::cout << "ext: " << std::endl; + // d_config.ext = 2; + std::cout << "ext: " << d_config.ext << std::endl; + + // return 0; + + START_TIMER(reduce_sum); + ICICLE_CHECK(bn254_vector_sum(d_a, N, &d_config, d_out)); + END_TIMER(reduce_sum, "reduce sum took"); + + + std::cout << "h_out: " << h_out[0] << std::endl; + std::cout << "d_out: " << d_out[0] << std::endl; + + + + + START_TIMER(baseline_reduce_product); + h_out[0] = scalar_t::one(); + for (uint64_t i = 0; i < N; ++i) { + h_out[0] = h_out[0] * h_a[i]; + } + 
END_TIMER(baseline_reduce_product, "baseline reduce product took"); + + + START_TIMER(reduce_product); + ICICLE_CHECK(bn254_vector_product(d_a, N, &d_config, d_out)); + END_TIMER(reduce_product, "reduce product took"); + + + std::cout << "h_out: " << h_out[0] << std::endl; + std::cout << "d_out: " << d_out[0] << std::endl; + + + + + + ICICLE_CHECK(icicle_free(d_a)); + ICICLE_CHECK(icicle_free(d_b)); + ICICLE_CHECK(icicle_free(d_out)); + + return 0; +} \ No newline at end of file diff --git a/examples/c++/vector-api/run.sh b/examples/c++/vector-api/run.sh new file mode 100755 index 000000000..879390d0a --- /dev/null +++ b/examples/c++/vector-api/run.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# Exit immediately if a command exits with a non-zero status +set -e + +# Function to display usage information +show_help() { + echo "Usage: $0 [-d DEVICE_TYPE] [-b ICICLE_BACKEND_INSTALL_DIR]" + echo + echo "Options:" + echo " -d DEVICE_TYPE Specify the device type (default: CPU)" + echo " -b ICICLE_BACKEND_INSTALL_DIR Specify the backend installation directory (default: empty)" + echo " -h Show this help message" + exit 0 +} + +# Parse command line options +while getopts ":d:b:h" opt; do + case ${opt} in + d ) + DEVICE_TYPE=$OPTARG + ;; + b ) + ICICLE_BACKEND_INSTALL_DIR="$(realpath ${OPTARG})" + ;; + h ) + show_help + ;; + \? ) + echo "Invalid option: -$OPTARG" 1>&2 + show_help + ;; + : ) + echo "Invalid option: -$OPTARG requires an argument" 1>&2 + show_help + ;; + esac +done + +# Set default values if not provided +: "${DEVICE_TYPE:=CPU}" +: "${ICICLE_BACKEND_INSTALL_DIR:=}" + +# Create necessary directories +mkdir -p build/example +mkdir -p build/icicle + +ICILE_DIR=$(realpath "../../../icicle/") +ICICLE_CUDA_SOURCE_DIR="${ICILE_DIR}/backend/cuda" + +# Build Icicle and the example app that links to it +if [ "$DEVICE_TYPE" == "CUDA" ] && [ ! 
-d "${ICICLE_BACKEND_INSTALL_DIR}" ] && [ -d "${ICICLE_CUDA_SOURCE_DIR}" ]; then + echo "Building icicle with CUDA backend" + cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DMSM=OFF -DG2=OFF -DECNTT=OFF -DCUDA_BACKEND=local -S "${ICILE_DIR}" -B build/icicle + export ICICLE_BACKEND_INSTALL_DIR=$(realpath "build/icicle/backend") +else + echo "Building icicle without CUDA backend, ICICLE_BACKEND_INSTALL_DIR=${ICICLE_BACKEND_INSTALL_DIR}" + export ICICLE_BACKEND_INSTALL_DIR="${ICICLE_BACKEND_INSTALL_DIR}" + cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -S "${ICILE_DIR}" -B build/icicle +fi +cmake -DCMAKE_BUILD_TYPE=Release -S . -B build/example + +cmake --build build/icicle -j +cmake --build build/example -j + +./build/example/example "$DEVICE_TYPE" diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index e3f7532aa..2b0114611 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -154,7 +154,9 @@ class VectorOpTask : public TaskBase // Single worker functionality to execute sum(vector) void vector_sum() { + ICICLE_LOG_INFO << "enter vector_sum"; *m_output = m_op_a[0]; + ICICLE_LOG_INFO << "point 1"; for (uint64_t i = 1; i < m_nof_operations; ++i) { *m_output = *m_output + m_op_a[i]; } @@ -242,6 +244,7 @@ class VectorOpTask : public TaskBase int m_bit_size; // use in bitrev operation uint64_t m_stride; // used in slice operation T* m_output; // pointer to the output. Can be a vector or scalar pointer +public: T m_intermidiate_res; // pointer to the output. 
Can be a vector or scalar pointer }; @@ -339,6 +342,11 @@ cpu_vector_div(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, REGISTER_VECTOR_DIV_BACKEND("CPU", cpu_vector_div); + +// #define SP_DEBUG + +#ifndef SP_DEBUG + /*********************************** SUM ***********************************/ template eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) @@ -362,8 +370,27 @@ eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, co return eIcicleError::SUCCESS; } +#else + +template +eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) +{ + *output = scalar_t::zero(); + for (uint64_t i = 0; i < n; ++i) { + *output = *output + vec_a[i]; + } + return eIcicleError::SUCCESS; +} + +#endif + + REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); -/*********************************** SUM ***********************************/ +/*********************************** PRODUCT ***********************************/ + + +#ifndef SP_DEBUG + template eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) { @@ -379,13 +406,26 @@ eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n } if (vec_s_offset < n) { task_p->send_intermidiate_res_task( - VecOperation::VECTOR_SUM, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_s_offset), vec_a + vec_s_offset); + VecOperation::VECTOR_PRODUCT, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_s_offset), vec_a + vec_s_offset); vec_s_offset += NOF_OPERATIONS_PER_TASK; } } while (task_p != nullptr); return eIcicleError::SUCCESS; } +#else +template +eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) +{ + *output = scalar_t::one(); + for (uint64_t i = 0; i < n; ++i) { + *output = *output * vec_a[i]; + } + return 
eIcicleError::SUCCESS; +} + +#endif + REGISTER_VECTOR_PRODUCT_BACKEND("CPU", cpu_vector_product); /*********************************** MUL BY SCALAR***********************************/ diff --git a/icicle/include/icicle/api/babybear.h b/icicle/include/icicle/api/babybear.h index c0104443e..0e329f4d1 100644 --- a/icicle/include/icicle/api/babybear.h +++ b/icicle/include/icicle/api/babybear.h @@ -9,6 +9,11 @@ #include "icicle/ntt.h" #include "icicle/vec_ops.h" +extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size); + +extern "C" void babybear_scalar_convert_montgomery( + const babybear::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, babybear::scalar_t* output); + extern "C" eIcicleError babybear_ntt_init_domain( babybear::scalar_t* primitive_root, const NTTInitDomainConfig* config); @@ -17,25 +22,14 @@ extern "C" eIcicleError babybear_ntt( extern "C" eIcicleError babybear_ntt_release_domain(); -extern "C" eIcicleError babybear_vector_mul( - const babybear::scalar_t* vec_a, const babybear::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* result); - -extern "C" eIcicleError babybear_vector_add( - const babybear::scalar_t* vec_a, const babybear::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* result); - -extern "C" eIcicleError babybear_vector_sub( - const babybear::scalar_t* vec_a, const babybear::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* result); +extern "C" eIcicleError babybear_extension_ntt( + const babybear::extension_t* input, int size, NTTDir dir, const NTTConfig* config, babybear::extension_t* output); -extern "C" eIcicleError babybear_matrix_transpose( - const babybear::scalar_t* input, - uint32_t nof_rows, - uint32_t nof_cols, - const VecOpsConfig* config, - babybear::scalar_t* output); -extern "C" eIcicleError babybear_bit_reverse( - const babybear::scalar_t* input, uint64_t n, const VecOpsConfig* config, 
babybear::scalar_t* output); +extern "C" void babybear_extension_generate_scalars(babybear::extension_t* scalars, int size); +extern "C" eIcicleError babybear_extension_scalar_convert_montgomery( + const babybear::extension_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, babybear::extension_t* output); extern "C" eIcicleError babybear_extension_vector_mul( const babybear::extension_t* vec_a, const babybear::extension_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::extension_t* result); @@ -57,17 +51,23 @@ extern "C" eIcicleError babybear_extension_bit_reverse( const babybear::extension_t* input, uint64_t n, const VecOpsConfig* config, babybear::extension_t* output); -extern "C" void babybear_extension_generate_scalars(babybear::extension_t* scalars, int size); +extern "C" eIcicleError babybear_vector_mul( + const babybear::scalar_t* vec_a, const babybear::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* result); -extern "C" eIcicleError babybear_extension_scalar_convert_montgomery( - const babybear::extension_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, babybear::extension_t* output); +extern "C" eIcicleError babybear_vector_add( + const babybear::scalar_t* vec_a, const babybear::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* result); -extern "C" eIcicleError babybear_extension_ntt( - const babybear::extension_t* input, int size, NTTDir dir, const NTTConfig* config, babybear::extension_t* output); +extern "C" eIcicleError babybear_vector_sub( + const babybear::scalar_t* vec_a, const babybear::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* result); +extern "C" eIcicleError babybear_matrix_transpose( + const babybear::scalar_t* input, + uint32_t nof_rows, + uint32_t nof_cols, + const VecOpsConfig* config, + babybear::scalar_t* output); -extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size); +extern "C" 
eIcicleError babybear_bit_reverse( + const babybear::scalar_t* input, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* output); -extern "C" void babybear_scalar_convert_montgomery( - const babybear::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, babybear::scalar_t* output); diff --git a/icicle/include/icicle/api/bls12_377.h b/icicle/include/icicle/api/bls12_377.h index 8287a5102..c617dcaf9 100644 --- a/icicle/include/icicle/api/bls12_377.h +++ b/icicle/include/icicle/api/bls12_377.h @@ -10,19 +10,19 @@ #include "icicle/msm.h" #include "icicle/vec_ops.h" -extern "C" bool bls12_377_eq(bls12_377::projective_t* point1, bls12_377::projective_t* point2); +extern "C" bool bls12_377_g2_eq(bls12_377::g2_projective_t* point1, bls12_377::g2_projective_t* point2); -extern "C" void bls12_377_to_affine(bls12_377::projective_t* point, bls12_377::affine_t* point_out); +extern "C" void bls12_377_g2_to_affine(bls12_377::g2_projective_t* point, bls12_377::g2_affine_t* point_out); -extern "C" void bls12_377_generate_projective_points(bls12_377::projective_t* points, int size); +extern "C" void bls12_377_g2_generate_projective_points(bls12_377::g2_projective_t* points, int size); -extern "C" void bls12_377_generate_affine_points(bls12_377::affine_t* points, int size); +extern "C" void bls12_377_g2_generate_affine_points(bls12_377::g2_affine_t* points, int size); -extern "C" eIcicleError bls12_377_affine_convert_montgomery( - const bls12_377::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::affine_t* output); +extern "C" eIcicleError bls12_377_g2_affine_convert_montgomery( + const bls12_377::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::g2_affine_t* output); -extern "C" eIcicleError bls12_377_projective_convert_montgomery( - const bls12_377::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::projective_t* output); +extern "C" eIcicleError 
bls12_377_g2_projective_convert_montgomery( + const bls12_377::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::g2_projective_t* output); extern "C" eIcicleError bls12_377_ecntt( const bls12_377::projective_t* input, int size, NTTDir dir, const NTTConfig* config, bls12_377::projective_t* output); @@ -37,19 +37,19 @@ extern "C" eIcicleError bls12_377_precompute_msm_bases( extern "C" eIcicleError bls12_377_msm( const bls12_377::scalar_t* scalars, const bls12_377::affine_t* points, int msm_size, const MSMConfig* config, bls12_377::projective_t* out); -extern "C" bool bls12_377_g2_eq(bls12_377::g2_projective_t* point1, bls12_377::g2_projective_t* point2); +extern "C" bool bls12_377_eq(bls12_377::projective_t* point1, bls12_377::projective_t* point2); -extern "C" void bls12_377_g2_to_affine(bls12_377::g2_projective_t* point, bls12_377::g2_affine_t* point_out); +extern "C" void bls12_377_to_affine(bls12_377::projective_t* point, bls12_377::affine_t* point_out); -extern "C" void bls12_377_g2_generate_projective_points(bls12_377::g2_projective_t* points, int size); +extern "C" void bls12_377_generate_projective_points(bls12_377::projective_t* points, int size); -extern "C" void bls12_377_g2_generate_affine_points(bls12_377::g2_affine_t* points, int size); +extern "C" void bls12_377_generate_affine_points(bls12_377::affine_t* points, int size); -extern "C" eIcicleError bls12_377_g2_affine_convert_montgomery( - const bls12_377::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::g2_affine_t* output); +extern "C" eIcicleError bls12_377_affine_convert_montgomery( + const bls12_377::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::affine_t* output); -extern "C" eIcicleError bls12_377_g2_projective_convert_montgomery( - const bls12_377::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::g2_projective_t* output); +extern "C" eIcicleError 
bls12_377_projective_convert_montgomery( + const bls12_377::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::projective_t* output); extern "C" eIcicleError bls12_377_g2_precompute_msm_bases( const bls12_377::g2_affine_t* bases, @@ -60,6 +60,11 @@ extern "C" eIcicleError bls12_377_g2_precompute_msm_bases( extern "C" eIcicleError bls12_377_g2_msm( const bls12_377::scalar_t* scalars, const bls12_377::g2_affine_t* points, int msm_size, const MSMConfig* config, bls12_377::g2_projective_t* out); +extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size); + +extern "C" void bls12_377_scalar_convert_montgomery( + const bls12_377::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bls12_377::scalar_t* output); + extern "C" eIcicleError bls12_377_ntt_init_domain( bls12_377::scalar_t* primitive_root, const NTTInitDomainConfig* config); @@ -88,8 +93,3 @@ extern "C" eIcicleError bls12_377_bit_reverse( const bls12_377::scalar_t* input, uint64_t n, const VecOpsConfig* config, bls12_377::scalar_t* output); -extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size); - -extern "C" void bls12_377_scalar_convert_montgomery( - const bls12_377::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bls12_377::scalar_t* output); - diff --git a/icicle/include/icicle/api/bls12_381.h b/icicle/include/icicle/api/bls12_381.h index d2b7d6999..361731586 100644 --- a/icicle/include/icicle/api/bls12_381.h +++ b/icicle/include/icicle/api/bls12_381.h @@ -10,19 +10,19 @@ #include "icicle/msm.h" #include "icicle/vec_ops.h" -extern "C" bool bls12_381_eq(bls12_381::projective_t* point1, bls12_381::projective_t* point2); +extern "C" bool bls12_381_g2_eq(bls12_381::g2_projective_t* point1, bls12_381::g2_projective_t* point2); -extern "C" void bls12_381_to_affine(bls12_381::projective_t* point, bls12_381::affine_t* point_out); +extern "C" void 
bls12_381_g2_to_affine(bls12_381::g2_projective_t* point, bls12_381::g2_affine_t* point_out); -extern "C" void bls12_381_generate_projective_points(bls12_381::projective_t* points, int size); +extern "C" void bls12_381_g2_generate_projective_points(bls12_381::g2_projective_t* points, int size); -extern "C" void bls12_381_generate_affine_points(bls12_381::affine_t* points, int size); +extern "C" void bls12_381_g2_generate_affine_points(bls12_381::g2_affine_t* points, int size); -extern "C" eIcicleError bls12_381_affine_convert_montgomery( - const bls12_381::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::affine_t* output); +extern "C" eIcicleError bls12_381_g2_affine_convert_montgomery( + const bls12_381::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::g2_affine_t* output); -extern "C" eIcicleError bls12_381_projective_convert_montgomery( - const bls12_381::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::projective_t* output); +extern "C" eIcicleError bls12_381_g2_projective_convert_montgomery( + const bls12_381::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::g2_projective_t* output); extern "C" eIcicleError bls12_381_ecntt( const bls12_381::projective_t* input, int size, NTTDir dir, const NTTConfig* config, bls12_381::projective_t* output); @@ -37,19 +37,19 @@ extern "C" eIcicleError bls12_381_precompute_msm_bases( extern "C" eIcicleError bls12_381_msm( const bls12_381::scalar_t* scalars, const bls12_381::affine_t* points, int msm_size, const MSMConfig* config, bls12_381::projective_t* out); -extern "C" bool bls12_381_g2_eq(bls12_381::g2_projective_t* point1, bls12_381::g2_projective_t* point2); +extern "C" bool bls12_381_eq(bls12_381::projective_t* point1, bls12_381::projective_t* point2); -extern "C" void bls12_381_g2_to_affine(bls12_381::g2_projective_t* point, bls12_381::g2_affine_t* point_out); +extern "C" void 
bls12_381_to_affine(bls12_381::projective_t* point, bls12_381::affine_t* point_out); -extern "C" void bls12_381_g2_generate_projective_points(bls12_381::g2_projective_t* points, int size); +extern "C" void bls12_381_generate_projective_points(bls12_381::projective_t* points, int size); -extern "C" void bls12_381_g2_generate_affine_points(bls12_381::g2_affine_t* points, int size); +extern "C" void bls12_381_generate_affine_points(bls12_381::affine_t* points, int size); -extern "C" eIcicleError bls12_381_g2_affine_convert_montgomery( - const bls12_381::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::g2_affine_t* output); +extern "C" eIcicleError bls12_381_affine_convert_montgomery( + const bls12_381::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::affine_t* output); -extern "C" eIcicleError bls12_381_g2_projective_convert_montgomery( - const bls12_381::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::g2_projective_t* output); +extern "C" eIcicleError bls12_381_projective_convert_montgomery( + const bls12_381::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::projective_t* output); extern "C" eIcicleError bls12_381_g2_precompute_msm_bases( const bls12_381::g2_affine_t* bases, @@ -60,6 +60,11 @@ extern "C" eIcicleError bls12_381_g2_precompute_msm_bases( extern "C" eIcicleError bls12_381_g2_msm( const bls12_381::scalar_t* scalars, const bls12_381::g2_affine_t* points, int msm_size, const MSMConfig* config, bls12_381::g2_projective_t* out); +extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size); + +extern "C" void bls12_381_scalar_convert_montgomery( + const bls12_381::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bls12_381::scalar_t* output); + extern "C" eIcicleError bls12_381_ntt_init_domain( bls12_381::scalar_t* primitive_root, const NTTInitDomainConfig* config); @@ -88,8 
+93,3 @@ extern "C" eIcicleError bls12_381_bit_reverse( const bls12_381::scalar_t* input, uint64_t n, const VecOpsConfig* config, bls12_381::scalar_t* output); -extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size); - -extern "C" void bls12_381_scalar_convert_montgomery( - const bls12_381::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bls12_381::scalar_t* output); - diff --git a/icicle/include/icicle/api/bn254.h b/icicle/include/icicle/api/bn254.h index d054f23b4..928cb639e 100644 --- a/icicle/include/icicle/api/bn254.h +++ b/icicle/include/icicle/api/bn254.h @@ -10,19 +10,19 @@ #include "icicle/msm.h" #include "icicle/vec_ops.h" -extern "C" bool bn254_eq(bn254::projective_t* point1, bn254::projective_t* point2); +extern "C" bool bn254_g2_eq(bn254::g2_projective_t* point1, bn254::g2_projective_t* point2); -extern "C" void bn254_to_affine(bn254::projective_t* point, bn254::affine_t* point_out); +extern "C" void bn254_g2_to_affine(bn254::g2_projective_t* point, bn254::g2_affine_t* point_out); -extern "C" void bn254_generate_projective_points(bn254::projective_t* points, int size); +extern "C" void bn254_g2_generate_projective_points(bn254::g2_projective_t* points, int size); -extern "C" void bn254_generate_affine_points(bn254::affine_t* points, int size); +extern "C" void bn254_g2_generate_affine_points(bn254::g2_affine_t* points, int size); -extern "C" eIcicleError bn254_affine_convert_montgomery( - const bn254::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::affine_t* output); +extern "C" eIcicleError bn254_g2_affine_convert_montgomery( + const bn254::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::g2_affine_t* output); -extern "C" eIcicleError bn254_projective_convert_montgomery( - const bn254::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::projective_t* output); +extern "C" eIcicleError 
bn254_g2_projective_convert_montgomery( + const bn254::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::g2_projective_t* output); extern "C" eIcicleError bn254_ecntt( const bn254::projective_t* input, int size, NTTDir dir, const NTTConfig* config, bn254::projective_t* output); @@ -37,19 +37,19 @@ extern "C" eIcicleError bn254_precompute_msm_bases( extern "C" eIcicleError bn254_msm( const bn254::scalar_t* scalars, const bn254::affine_t* points, int msm_size, const MSMConfig* config, bn254::projective_t* out); -extern "C" bool bn254_g2_eq(bn254::g2_projective_t* point1, bn254::g2_projective_t* point2); +extern "C" bool bn254_eq(bn254::projective_t* point1, bn254::projective_t* point2); -extern "C" void bn254_g2_to_affine(bn254::g2_projective_t* point, bn254::g2_affine_t* point_out); +extern "C" void bn254_to_affine(bn254::projective_t* point, bn254::affine_t* point_out); -extern "C" void bn254_g2_generate_projective_points(bn254::g2_projective_t* points, int size); +extern "C" void bn254_generate_projective_points(bn254::projective_t* points, int size); -extern "C" void bn254_g2_generate_affine_points(bn254::g2_affine_t* points, int size); +extern "C" void bn254_generate_affine_points(bn254::affine_t* points, int size); -extern "C" eIcicleError bn254_g2_affine_convert_montgomery( - const bn254::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::g2_affine_t* output); +extern "C" eIcicleError bn254_affine_convert_montgomery( + const bn254::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::affine_t* output); -extern "C" eIcicleError bn254_g2_projective_convert_montgomery( - const bn254::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::g2_projective_t* output); +extern "C" eIcicleError bn254_projective_convert_montgomery( + const bn254::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::projective_t* output); extern "C" 
eIcicleError bn254_g2_precompute_msm_bases( const bn254::g2_affine_t* bases, @@ -60,6 +60,11 @@ extern "C" eIcicleError bn254_g2_precompute_msm_bases( extern "C" eIcicleError bn254_g2_msm( const bn254::scalar_t* scalars, const bn254::g2_affine_t* points, int msm_size, const MSMConfig* config, bn254::g2_projective_t* out); +extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size); + +extern "C" void bn254_scalar_convert_montgomery( + const bn254::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bn254::scalar_t* output); + extern "C" eIcicleError bn254_ntt_init_domain( bn254::scalar_t* primitive_root, const NTTInitDomainConfig* config); @@ -88,8 +93,3 @@ extern "C" eIcicleError bn254_bit_reverse( const bn254::scalar_t* input, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* output); -extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size); - -extern "C" void bn254_scalar_convert_montgomery( - const bn254::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bn254::scalar_t* output); - diff --git a/icicle/include/icicle/api/bw6_761.h b/icicle/include/icicle/api/bw6_761.h index 31d3b87e2..6b48606a2 100644 --- a/icicle/include/icicle/api/bw6_761.h +++ b/icicle/include/icicle/api/bw6_761.h @@ -10,19 +10,19 @@ #include "icicle/msm.h" #include "icicle/vec_ops.h" -extern "C" bool bw6_761_eq(bw6_761::projective_t* point1, bw6_761::projective_t* point2); +extern "C" bool bw6_761_g2_eq(bw6_761::g2_projective_t* point1, bw6_761::g2_projective_t* point2); -extern "C" void bw6_761_to_affine(bw6_761::projective_t* point, bw6_761::affine_t* point_out); +extern "C" void bw6_761_g2_to_affine(bw6_761::g2_projective_t* point, bw6_761::g2_affine_t* point_out); -extern "C" void bw6_761_generate_projective_points(bw6_761::projective_t* points, int size); +extern "C" void bw6_761_g2_generate_projective_points(bw6_761::g2_projective_t* points, int size); -extern "C" void 
bw6_761_generate_affine_points(bw6_761::affine_t* points, int size); +extern "C" void bw6_761_g2_generate_affine_points(bw6_761::g2_affine_t* points, int size); -extern "C" eIcicleError bw6_761_affine_convert_montgomery( - const bw6_761::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::affine_t* output); +extern "C" eIcicleError bw6_761_g2_affine_convert_montgomery( + const bw6_761::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::g2_affine_t* output); -extern "C" eIcicleError bw6_761_projective_convert_montgomery( - const bw6_761::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::projective_t* output); +extern "C" eIcicleError bw6_761_g2_projective_convert_montgomery( + const bw6_761::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::g2_projective_t* output); extern "C" eIcicleError bw6_761_ecntt( const bw6_761::projective_t* input, int size, NTTDir dir, const NTTConfig* config, bw6_761::projective_t* output); @@ -37,19 +37,19 @@ extern "C" eIcicleError bw6_761_precompute_msm_bases( extern "C" eIcicleError bw6_761_msm( const bw6_761::scalar_t* scalars, const bw6_761::affine_t* points, int msm_size, const MSMConfig* config, bw6_761::projective_t* out); -extern "C" bool bw6_761_g2_eq(bw6_761::g2_projective_t* point1, bw6_761::g2_projective_t* point2); +extern "C" bool bw6_761_eq(bw6_761::projective_t* point1, bw6_761::projective_t* point2); -extern "C" void bw6_761_g2_to_affine(bw6_761::g2_projective_t* point, bw6_761::g2_affine_t* point_out); +extern "C" void bw6_761_to_affine(bw6_761::projective_t* point, bw6_761::affine_t* point_out); -extern "C" void bw6_761_g2_generate_projective_points(bw6_761::g2_projective_t* points, int size); +extern "C" void bw6_761_generate_projective_points(bw6_761::projective_t* points, int size); -extern "C" void bw6_761_g2_generate_affine_points(bw6_761::g2_affine_t* points, int size); +extern "C" void 
bw6_761_generate_affine_points(bw6_761::affine_t* points, int size); -extern "C" eIcicleError bw6_761_g2_affine_convert_montgomery( - const bw6_761::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::g2_affine_t* output); +extern "C" eIcicleError bw6_761_affine_convert_montgomery( + const bw6_761::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::affine_t* output); -extern "C" eIcicleError bw6_761_g2_projective_convert_montgomery( - const bw6_761::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::g2_projective_t* output); +extern "C" eIcicleError bw6_761_projective_convert_montgomery( + const bw6_761::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::projective_t* output); extern "C" eIcicleError bw6_761_g2_precompute_msm_bases( const bw6_761::g2_affine_t* bases, @@ -60,6 +60,11 @@ extern "C" eIcicleError bw6_761_g2_precompute_msm_bases( extern "C" eIcicleError bw6_761_g2_msm( const bw6_761::scalar_t* scalars, const bw6_761::g2_affine_t* points, int msm_size, const MSMConfig* config, bw6_761::g2_projective_t* out); +extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size); + +extern "C" void bw6_761_scalar_convert_montgomery( + const bw6_761::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bw6_761::scalar_t* output); + extern "C" eIcicleError bw6_761_ntt_init_domain( bw6_761::scalar_t* primitive_root, const NTTInitDomainConfig* config); @@ -88,8 +93,3 @@ extern "C" eIcicleError bw6_761_bit_reverse( const bw6_761::scalar_t* input, uint64_t n, const VecOpsConfig* config, bw6_761::scalar_t* output); -extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size); - -extern "C" void bw6_761_scalar_convert_montgomery( - const bw6_761::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bw6_761::scalar_t* output); - diff --git 
a/icicle/include/icicle/api/grumpkin.h b/icicle/include/icicle/api/grumpkin.h index 9908e492b..42b1b2195 100644 --- a/icicle/include/icicle/api/grumpkin.h +++ b/icicle/include/icicle/api/grumpkin.h @@ -9,6 +9,15 @@ #include "icicle/msm.h" #include "icicle/vec_ops.h" +extern "C" eIcicleError grumpkin_precompute_msm_bases( + const grumpkin::affine_t* bases, + int nof_bases, + const MSMConfig* config, + grumpkin::affine_t* output_bases); + +extern "C" eIcicleError grumpkin_msm( + const grumpkin::scalar_t* scalars, const grumpkin::affine_t* points, int msm_size, const MSMConfig* config, grumpkin::projective_t* out); + extern "C" bool grumpkin_eq(grumpkin::projective_t* point1, grumpkin::projective_t* point2); extern "C" void grumpkin_to_affine(grumpkin::projective_t* point, grumpkin::affine_t* point_out); @@ -23,14 +32,10 @@ extern "C" eIcicleError grumpkin_affine_convert_montgomery( extern "C" eIcicleError grumpkin_projective_convert_montgomery( const grumpkin::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, grumpkin::projective_t* output); -extern "C" eIcicleError grumpkin_precompute_msm_bases( - const grumpkin::affine_t* bases, - int nof_bases, - const MSMConfig* config, - grumpkin::affine_t* output_bases); +extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size); -extern "C" eIcicleError grumpkin_msm( - const grumpkin::scalar_t* scalars, const grumpkin::affine_t* points, int msm_size, const MSMConfig* config, grumpkin::projective_t* out); +extern "C" void grumpkin_scalar_convert_montgomery( + const grumpkin::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, grumpkin::scalar_t* output); extern "C" eIcicleError grumpkin_vector_mul( const grumpkin::scalar_t* vec_a, const grumpkin::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, grumpkin::scalar_t* result); @@ -52,8 +57,3 @@ extern "C" eIcicleError grumpkin_bit_reverse( const grumpkin::scalar_t* input, uint64_t n, const VecOpsConfig* 
config, grumpkin::scalar_t* output); -extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size); - -extern "C" void grumpkin_scalar_convert_montgomery( - const grumpkin::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, grumpkin::scalar_t* output); - diff --git a/icicle/include/icicle/api/stark252.h b/icicle/include/icicle/api/stark252.h index 3bbe9626f..6a8ff1a74 100644 --- a/icicle/include/icicle/api/stark252.h +++ b/icicle/include/icicle/api/stark252.h @@ -9,6 +9,11 @@ #include "icicle/ntt.h" #include "icicle/vec_ops.h" +extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size); + +extern "C" void stark252_scalar_convert_montgomery( + const stark252::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, stark252::scalar_t* output); + extern "C" eIcicleError stark252_ntt_init_domain( stark252::scalar_t* primitive_root, const NTTInitDomainConfig* config); @@ -37,8 +42,3 @@ extern "C" eIcicleError stark252_bit_reverse( const stark252::scalar_t* input, uint64_t n, const VecOpsConfig* config, stark252::scalar_t* output); -extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size); - -extern "C" void stark252_scalar_convert_montgomery( - const stark252::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, stark252::scalar_t* output); - diff --git a/icicle/include/icicle/backend/vec_ops_backend.h b/icicle/include/icicle/backend/vec_ops_backend.h index 74502a9a4..3914e750a 100644 --- a/icicle/include/icicle/backend/vec_ops_backend.h +++ b/icicle/include/icicle/backend/vec_ops_backend.h @@ -7,6 +7,15 @@ using namespace field_config; namespace icicle { /*************************** Backend registration ***************************/ + using scalarVectorReduceOpImpl = std::function; + + + using scalarVectorOpImpl = std::function; + void register_vector_sum(const std::string& deviceType, scalarVectorReduceOpImpl impl); + +#define 
REGISTER_VECTOR_SUM_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_sum) = []() -> bool { \ + register_vector_sum(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_vector_product(const std::string& deviceType, scalarVectorReduceOpImpl impl); + +#define REGISTER_VECTOR_PRODUCT_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_product) = []() -> bool { \ + register_vector_product(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + + void register_vector_add(const std::string& deviceType, scalarVectorOpImpl impl); #define REGISTER_VECTOR_ADD_BACKEND(DEVICE_TYPE, FUNC) \ diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index f29ccd335..322ed0c81 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -48,11 +48,33 @@ namespace icicle { // Reduction operations + /** + * @brief Computes the product of all elements in a vector. + * + * @tparam T Type of the elements in the vector. + * @param vec_a Input vector. + * @param n Number of elements in the vector. + * @param config Configuration for the operation. + * @param output Output scalar to store the result. + * @return eIcicleError Error code indicating success or failure. + */ + template - eIcicleError vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output); + eIcicleError vector_product(const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output); + + /** + * @brief Computes the sum of all elements in a vector. + * + * @tparam T Type of the elements in the vector. + * @param vec_a Input vector. + * @param n Number of elements in the vector. + * @param config Configuration for the operation. + * @param output Output scalar to store the result. + * @return eIcicleError Error code indicating success or failure. 
+ */ template - eIcicleError vector_sum(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output); + eIcicleError vector_sum(const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output); // Element-wise vector operations diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index ad44767a5..e0acd0091 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -5,9 +5,13 @@ namespace icicle { /*********************************** REDUCE PRODUCT ************************/ - ICICLE_DISPATCHER_INST(VectorProductDispatcher, vector_product, scalarVectorOpImpl /* @@@ confirm this argument */); + ICICLE_DISPATCHER_INST(VectorProductDispatcher, vector_product, scalarVectorReduceOpImpl); - // TODO: extern "C" for FFI + extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_product)( + const scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, scalar_t* output) + { + return VectorProductDispatcher::execute(vec_a, n, *config, output); + } template <> eIcicleError @@ -17,9 +21,13 @@ namespace icicle { } /*********************************** REDUCE SUM ****************************/ - ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, scalarVectorOpImpl /* @@@ confirm this argument */); + ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, scalarVectorReduceOpImpl ); - // TODO: extern "C" for FFI + extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_sum)( + const scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, scalar_t* output) + { + return VectorSumDispatcher::execute(vec_a, n, *config, output); + } template <> eIcicleError From 04351fbfc217ad6edb2c20583fc77f4c0a6343ed Mon Sep 17 00:00:00 2001 From: Stas Polonsky Date: Mon, 16 Sep 2024 13:44:23 +0000 Subject: [PATCH 03/43] for Miki --- examples/c++/vector-api/example.cpp | 11 ++-------- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 22 ++++++++++++++++++-- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/examples/c++/vector-api/example.cpp 
b/examples/c++/vector-api/example.cpp index ca653abe9..953ddbb84 100644 --- a/examples/c++/vector-api/example.cpp +++ b/examples/c++/vector-api/example.cpp @@ -38,7 +38,7 @@ int main(int argc, char** argv) { try_load_and_set_backend_device(argc, argv); - int N_LOG = 20; + int N_LOG = 10; int N = 1 << N_LOG; // on-host data @@ -90,13 +90,6 @@ int main(int argc, char** argv) } END_TIMER(baseline_reduce_sum, "baseline reduce sum took"); - ICICLE_LOG_INFO << "Failed to load "; - std::cout << "ext: " << std::endl; - // d_config.ext = 2; - std::cout << "ext: " << d_config.ext << std::endl; - - // return 0; - START_TIMER(reduce_sum); ICICLE_CHECK(bn254_vector_sum(d_a, N, &d_config, d_out)); END_TIMER(reduce_sum, "reduce sum took"); @@ -106,7 +99,7 @@ int main(int argc, char** argv) std::cout << "d_out: " << d_out[0] << std::endl; - + return 0; START_TIMER(baseline_reduce_product); h_out[0] = scalar_t::one(); diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 2b0114611..939604e45 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -71,6 +71,7 @@ class VectorOpTask : public TaskBase m_operation = operation; m_nof_operations = nof_operations; m_op_a = op_a; + // SP: where is m_output? 
dispatch(); } @@ -155,10 +156,14 @@ class VectorOpTask : public TaskBase void vector_sum() { ICICLE_LOG_INFO << "enter vector_sum"; - *m_output = m_op_a[0]; + ICICLE_LOG_INFO << "m_op_a[0]: " << m_op_a[0]; + ICICLE_LOG_INFO << "point 0"; + // *m_output = m_op_a[0]; + m_intermidiate_res = m_op_a[0]; ICICLE_LOG_INFO << "point 1"; for (uint64_t i = 1; i < m_nof_operations; ++i) { - *m_output = *m_output + m_op_a[i]; + // *m_output = *m_output + m_op_a[i]; + m_intermidiate_res = m_intermidiate_res + m_op_a[i]; } } // Single worker functionality to execute product(vector) @@ -351,6 +356,7 @@ REGISTER_VECTOR_DIV_BACKEND("CPU", cpu_vector_div); template eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) { + ICICLE_LOG_INFO << "cpu_vector_sum"; TasksManager> task_manager(get_nof_workers(config)); bool output_initialized = false; uint64_t vec_s_offset = 0; @@ -359,14 +365,24 @@ eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, co do { task_p = vec_s_offset < n ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); if (task_p->is_completed()) { + ICICLE_LOG_INFO << "task_p->m_intermidiate_res: " << task_p->m_intermidiate_res; *output = output_initialized ? task_p->m_intermidiate_res : *output + task_p->m_intermidiate_res; + // SP: we used m_intermidiate_res, we have to mark it so we can't use it again. set_idle? + // SP: Use dispatch if setting a new task, or set_idle if to just mark the task result as handled. 
+ // output_initialized = true; + // task_p->set_idle(); + ICICLE_LOG_INFO << "after set_idle"; + ICICLE_LOG_INFO << "is_completed: " << task_p->is_completed(); } if (vec_s_offset < n) { + ICICLE_LOG_INFO << "vec_s_offset: " << vec_s_offset; task_p->send_intermidiate_res_task( VecOperation::VECTOR_SUM, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_s_offset), vec_a + vec_s_offset); vec_s_offset += NOF_OPERATIONS_PER_TASK; } + ICICLE_LOG_INFO << "task_p: " << task_p; } while (task_p != nullptr); + // } while (vec_s_offset < n); return eIcicleError::SUCCESS; } @@ -394,6 +410,7 @@ REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); template eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) { + ICICLE_LOG_INFO << "cpu_vector_product"; TasksManager> task_manager(get_nof_workers(config)); bool output_initialized = false; uint64_t vec_s_offset = 0; @@ -405,6 +422,7 @@ eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n *output = output_initialized ? 
task_p->m_intermidiate_res : *output * task_p->m_intermidiate_res; } if (vec_s_offset < n) { + ICICLE_LOG_INFO << "vec_s_offset: " << vec_s_offset; task_p->send_intermidiate_res_task( VecOperation::VECTOR_PRODUCT, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_s_offset), vec_a + vec_s_offset); vec_s_offset += NOF_OPERATIONS_PER_TASK; From f3086d4f38b8e0569faa833450acf671a3809e35 Mon Sep 17 00:00:00 2001 From: Stas Polonsky Date: Mon, 16 Sep 2024 18:40:24 +0000 Subject: [PATCH 04/43] debugged reduction ops --- examples/c++/vector-api/README.md | 26 ++--- examples/c++/vector-api/example.cpp | 34 ++++-- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 113 ++++++------------- 3 files changed, 72 insertions(+), 101 deletions(-) diff --git a/examples/c++/vector-api/README.md b/examples/c++/vector-api/README.md index 46c556339..120156c9f 100644 --- a/examples/c++/vector-api/README.md +++ b/examples/c++/vector-api/README.md @@ -1,18 +1,16 @@ # Icicle Example: Vector Operations API -TBD - ## Key-Takeaway -Icicle provides polynomial multiplication using the Number Theoretical Transform (NTT), including forward and inverse transforms. - -## Concise Usage Explanation +The Vector Operations API supports the following: -1. Include the necessary headers. -2. Initialize the NTT domain. -3. Prepare and transform the polynomials from host to device memory. -4. Perform pointwise multiplication. -5. Apply the inverse NTT. + - element-wise vector operations (e.g. addition, multiplication) + - vector reduction operations (e.g. sum of elements, product of elements) + - scalar-vector operations (e.g add scalar to vector) + - matrix operations (e.g. transposition) + - miscellaneous operations like bit-reversal and slicing. + + All these operations can be performed on a host or device both synchronously and asynchronously. ## Running the example @@ -25,8 +23,6 @@ Icicle provides polynomial multiplication using the Number Theoretical Transform ## What's in the example -1. 
Define the size of the example. -2. Initialize input polynomials. -3. Perform Radix-2 or Mixed-Radix NTT. -4. Perform pointwise polynomial multiplication. -5. Apply the inverse NTT. +1. `example_element_wise`: examples of element-wise operations +2. `example_scalar_vector`: examples of scalar-vector operations + diff --git a/examples/c++/vector-api/example.cpp b/examples/c++/vector-api/example.cpp index 953ddbb84..10ee787cb 100644 --- a/examples/c++/vector-api/example.cpp +++ b/examples/c++/vector-api/example.cpp @@ -27,18 +27,23 @@ void random_samples(scalar_t* res, uint32_t count) res[i] = i < 1000 ? scalar_t::rand_host() : res[i - 1000]; } -// void incremental_values(scalar_t* res, uint32_t count) -// { -// for (int i = 0; i < count; i++) { -// res[i] = i ? res[i - 1] + scalar_t::one() : scalar_t::zero(); -// } -// } +void incremental_values(scalar_t* res, uint32_t count) +{ + for (int i = 0; i < count; i++) { + res[i] = i ? res[i - 1] + scalar_t::one() : scalar_t::zero(); + } +} + + +void example_element_wise() { + return; +} int main(int argc, char** argv) { - try_load_and_set_backend_device(argc, argv); + // try_load_and_set_backend_device(argc, argv); - int N_LOG = 10; + int N_LOG = 20; int N = 1 << N_LOG; // on-host data @@ -49,11 +54,19 @@ int main(int argc, char** argv) random_samples(h_a.get(), N ); random_samples(h_b.get(), N ); + // incremental_values(h_a.get(), N ); + // incremental_values(h_b.get(), N ); + // on-device data scalar_t *d_a, *d_b, *d_out; DeviceProperties device_props; ICICLE_CHECK(icicle_get_device_properties(device_props)); + if (!device_props.using_host_memory) { + std::cout << "Device isn't using host memory" << std::endl; + } else { + std::cout << "Device is using host memory" << std::endl; + } ICICLE_CHECK(icicle_malloc((void**)&d_a, sizeof(scalar_t) * N)); ICICLE_CHECK(icicle_malloc((void**)&d_b, sizeof(scalar_t) * N)); @@ -91,7 +104,8 @@ int main(int argc, char** argv) END_TIMER(baseline_reduce_sum, "baseline reduce sum took"); 
START_TIMER(reduce_sum); - ICICLE_CHECK(bn254_vector_sum(d_a, N, &d_config, d_out)); + ICICLE_CHECK(bn254_vector_sum(d_a, N, &h_config, d_out)); + // ICICLE_CHECK(bn254_vector_sum(d_a, N, &d_config, d_out)); END_TIMER(reduce_sum, "reduce sum took"); @@ -99,7 +113,7 @@ int main(int argc, char** argv) std::cout << "d_out: " << d_out[0] << std::endl; - return 0; + // return 0; START_TIMER(baseline_reduce_product); h_out[0] = scalar_t::one(); diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 939604e45..a06c12a50 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -155,23 +155,21 @@ class VectorOpTask : public TaskBase // Single worker functionality to execute sum(vector) void vector_sum() { - ICICLE_LOG_INFO << "enter vector_sum"; - ICICLE_LOG_INFO << "m_op_a[0]: " << m_op_a[0]; - ICICLE_LOG_INFO << "point 0"; - // *m_output = m_op_a[0]; + // SP: *m_output = m_op_a[0]; m_intermidiate_res = m_op_a[0]; - ICICLE_LOG_INFO << "point 1"; for (uint64_t i = 1; i < m_nof_operations; ++i) { - // *m_output = *m_output + m_op_a[i]; + // SP: *m_output = *m_output + m_op_a[i]; m_intermidiate_res = m_intermidiate_res + m_op_a[i]; } } // Single worker functionality to execute product(vector) void vector_product() { - *m_output = m_op_a[0]; + // SP: *m_output = m_op_a[0]; + m_intermidiate_res = m_op_a[0]; for (uint64_t i = 1; i < m_nof_operations; ++i) { - *m_output = *m_output * m_op_a[i]; + // SP: *m_output = *m_output * m_op_a[i]; + m_intermidiate_res = m_intermidiate_res * m_op_a[i]; } } // Single worker functionality to execute conversion from barret to montgomery @@ -347,103 +345,66 @@ cpu_vector_div(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, REGISTER_VECTOR_DIV_BACKEND("CPU", cpu_vector_div); - -// #define SP_DEBUG - -#ifndef SP_DEBUG - /*********************************** SUM ***********************************/ template eIcicleError 
cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) { - ICICLE_LOG_INFO << "cpu_vector_sum"; TasksManager> task_manager(get_nof_workers(config)); bool output_initialized = false; - uint64_t vec_s_offset = 0; - VectorOpTask* task_p; + uint64_t vec_a_offset = 0; // run until all vector deployed and all tasks completed - do { - task_p = vec_s_offset < n ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + while (true) { + VectorOpTask* task_p = vec_a_offset < n ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + if (task_p == nullptr) { + return eIcicleError::SUCCESS; + } if (task_p->is_completed()) { - ICICLE_LOG_INFO << "task_p->m_intermidiate_res: " << task_p->m_intermidiate_res; - *output = output_initialized ? task_p->m_intermidiate_res : *output + task_p->m_intermidiate_res; - // SP: we used m_intermidiate_res, we have to mark it so we can't use it again. set_idle? - // SP: Use dispatch if setting a new task, or set_idle if to just mark the task result as handled. - // output_initialized = true; - // task_p->set_idle(); - ICICLE_LOG_INFO << "after set_idle"; - ICICLE_LOG_INFO << "is_completed: " << task_p->is_completed(); + *output = output_initialized ? 
*output + task_p->m_intermidiate_res : task_p->m_intermidiate_res; + output_initialized = true; } - if (vec_s_offset < n) { - ICICLE_LOG_INFO << "vec_s_offset: " << vec_s_offset; + if (vec_a_offset < n) { task_p->send_intermidiate_res_task( - VecOperation::VECTOR_SUM, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_s_offset), vec_a + vec_s_offset); - vec_s_offset += NOF_OPERATIONS_PER_TASK; + VecOperation::VECTOR_SUM, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_a_offset), vec_a + vec_a_offset); + vec_a_offset += NOF_OPERATIONS_PER_TASK; + } + else { + task_p->set_idle(); } - ICICLE_LOG_INFO << "task_p: " << task_p; - } while (task_p != nullptr); - // } while (vec_s_offset < n); - return eIcicleError::SUCCESS; -} - -#else - -template -eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) -{ - *output = scalar_t::zero(); - for (uint64_t i = 0; i < n; ++i) { - *output = *output + vec_a[i]; } - return eIcicleError::SUCCESS; } -#endif - - REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); -/*********************************** PRODUCT ***********************************/ - - -#ifndef SP_DEBUG +/*********************************** PRODUCT ***********************************/ template eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) { ICICLE_LOG_INFO << "cpu_vector_product"; TasksManager> task_manager(get_nof_workers(config)); bool output_initialized = false; - uint64_t vec_s_offset = 0; - VectorOpTask* task_p; + uint64_t vec_a_offset = 0; + // run until all vector deployed and all tasks completed - do { - task_p = vec_s_offset < n ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + while (true) { + VectorOpTask* task_p = vec_a_offset < n ? 
task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + if (task_p == nullptr) { + return eIcicleError::SUCCESS; + } if (task_p->is_completed()) { - *output = output_initialized ? task_p->m_intermidiate_res : *output * task_p->m_intermidiate_res; + *output = output_initialized ? *output * task_p->m_intermidiate_res : task_p->m_intermidiate_res; + output_initialized = true; } - if (vec_s_offset < n) { - ICICLE_LOG_INFO << "vec_s_offset: " << vec_s_offset; + if (vec_a_offset < n) { task_p->send_intermidiate_res_task( - VecOperation::VECTOR_PRODUCT, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_s_offset), vec_a + vec_s_offset); - vec_s_offset += NOF_OPERATIONS_PER_TASK; + VecOperation::VECTOR_PRODUCT, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_a_offset), vec_a + vec_a_offset); + vec_a_offset += NOF_OPERATIONS_PER_TASK; } - } while (task_p != nullptr); - return eIcicleError::SUCCESS; -} - -#else -template -eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) -{ - *output = scalar_t::one(); - for (uint64_t i = 0; i < n; ++i) { - *output = *output * vec_a[i]; - } - return eIcicleError::SUCCESS; + else { + task_p->set_idle(); + } + } } -#endif - REGISTER_VECTOR_PRODUCT_BACKEND("CPU", cpu_vector_product); /*********************************** MUL BY SCALAR***********************************/ From 2ab44886e67154aa711b31cdbe433b1362d0483e Mon Sep 17 00:00:00 2001 From: Stas Polonsky Date: Mon, 16 Sep 2024 20:29:22 +0000 Subject: [PATCH 05/43] added offset/stride to reduce ops --- examples/c++/vector-api/example.cpp | 8 ++++---- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 5 +++-- icicle/include/icicle/backend/vec_ops_backend.h | 4 +++- icicle/include/icicle/vec_ops.h | 4 ++-- icicle/src/vec_ops.cpp | 16 ++++++++-------- 5 files changed, 20 insertions(+), 17 deletions(-) diff --git a/examples/c++/vector-api/example.cpp b/examples/c++/vector-api/example.cpp index 
10ee787cb..16637d7e1 100644 --- a/examples/c++/vector-api/example.cpp +++ b/examples/c++/vector-api/example.cpp @@ -10,10 +10,10 @@ // SP: I undertstand this code is auto-generated, but I can't get scrip/gen to work. extern "C" eIcicleError bn254_vector_product( - const bn254::scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result); + const bn254::scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result, uint64_t offset, uint64_t stride); extern "C" eIcicleError bn254_vector_sum( - const bn254::scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result); + const bn254::scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result, uint64_t offset, uint64_t stride); // SP: end of my changes @@ -104,7 +104,7 @@ int main(int argc, char** argv) END_TIMER(baseline_reduce_sum, "baseline reduce sum took"); START_TIMER(reduce_sum); - ICICLE_CHECK(bn254_vector_sum(d_a, N, &h_config, d_out)); + ICICLE_CHECK(bn254_vector_sum(d_a, N, &h_config, d_out, 0, 1)); // ICICLE_CHECK(bn254_vector_sum(d_a, N, &d_config, d_out)); END_TIMER(reduce_sum, "reduce sum took"); @@ -124,7 +124,7 @@ int main(int argc, char** argv) START_TIMER(reduce_product); - ICICLE_CHECK(bn254_vector_product(d_a, N, &d_config, d_out)); + ICICLE_CHECK(bn254_vector_product(d_a, N, &d_config, d_out, 0, 1)); END_TIMER(reduce_product, "reduce product took"); diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index a06c12a50..3dba93937 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -346,8 +346,9 @@ cpu_vector_div(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, REGISTER_VECTOR_DIV_BACKEND("CPU", cpu_vector_div); /*********************************** SUM ***********************************/ + template -eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, const 
VecOpsConfig& config, T* output) +eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output, uint64_t offset, uint64_t stride) { TasksManager> task_manager(get_nof_workers(config)); bool output_initialized = false; @@ -377,7 +378,7 @@ REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); /*********************************** PRODUCT ***********************************/ template -eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output, uint64_t offset, uint64_t stride) { ICICLE_LOG_INFO << "cpu_vector_product"; TasksManager> task_manager(get_nof_workers(config)); diff --git a/icicle/include/icicle/backend/vec_ops_backend.h b/icicle/include/icicle/backend/vec_ops_backend.h index 3914e750a..3ce9271e7 100644 --- a/icicle/include/icicle/backend/vec_ops_backend.h +++ b/icicle/include/icicle/backend/vec_ops_backend.h @@ -12,7 +12,9 @@ namespace icicle { const scalar_t* vec_a, uint64_t n, const VecOpsConfig& config, - scalar_t* output)>; + scalar_t* output, + uint64_t offset, + uint64_t stride)>; diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index 322ed0c81..e0cf6f7af 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -60,7 +60,7 @@ namespace icicle { */ template - eIcicleError vector_product(const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output); + eIcicleError vector_product(const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output, uint64_t offset, uint64_t stride); /** * @brief Computes the sum of all elements in a vector. 
@@ -74,7 +74,7 @@ namespace icicle { */ template - eIcicleError vector_sum(const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output); + eIcicleError vector_sum(const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output, uint64_t offset, uint64_t stride); // Element-wise vector operations diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index e0acd0091..29ab25ba0 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -8,32 +8,32 @@ namespace icicle { ICICLE_DISPATCHER_INST(VectorProductDispatcher, vector_product, scalarVectorReduceOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_product)( - const scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, scalar_t* output, uint64_t offset, uint64_t stride) { - return VectorProductDispatcher::execute(vec_a, n, *config, output); + return VectorProductDispatcher::execute(vec_a, n, *config, output, offset, stride); } template <> eIcicleError - vector_product(const scalar_t* vec_a, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_product(const scalar_t* vec_a, uint64_t n, const VecOpsConfig& config, scalar_t* output, uint64_t offset, uint64_t stride) { - return CONCAT_EXPAND(FIELD, vector_product)(vec_a, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_product)(vec_a, n, &config, output, offset, stride); } /*********************************** REDUCE SUM ****************************/ ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, scalarVectorReduceOpImpl ); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_sum)( - const scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, scalar_t* output, uint64_t offset=0, uint64_t stride=1) { - return VectorSumDispatcher::execute(vec_a, n, *config, output); + return VectorSumDispatcher::execute(vec_a, n, *config, output, offset, 
stride); } template <> eIcicleError - vector_sum(const scalar_t* vec_a, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_sum(const scalar_t* vec_a, uint64_t n, const VecOpsConfig& config, scalar_t* output, uint64_t offset, uint64_t stride) { - return CONCAT_EXPAND(FIELD, vector_sum)(vec_a, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_sum)(vec_a, n, &config, output, offset, stride); } /*********************************** ADD ***********************************/ From 89e998ae2c7426e1843a6a9cf3f79f961c6cd4a6 Mon Sep 17 00:00:00 2001 From: Stas Polonsky Date: Tue, 17 Sep 2024 21:42:42 +0000 Subject: [PATCH 06/43] implemented strides ops --- examples/c++/vector-api/example.cpp | 15 ++++---- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 36 ++++++++++---------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/examples/c++/vector-api/example.cpp b/examples/c++/vector-api/example.cpp index 16637d7e1..2a998c5c7 100644 --- a/examples/c++/vector-api/example.cpp +++ b/examples/c++/vector-api/example.cpp @@ -30,7 +30,7 @@ void random_samples(scalar_t* res, uint32_t count) void incremental_values(scalar_t* res, uint32_t count) { for (int i = 0; i < count; i++) { - res[i] = i ? res[i - 1] + scalar_t::one() : scalar_t::zero(); + res[i] = i ? 
res[i - 1] + scalar_t::one() : scalar_t::one(); } } @@ -45,6 +45,8 @@ int main(int argc, char** argv) int N_LOG = 20; int N = 1 << N_LOG; + int offset = 1; + int stride = 4; // on-host data auto h_a = std::make_unique(N); @@ -98,14 +100,13 @@ int main(int argc, char** argv) START_TIMER(baseline_reduce_sum); h_out[0] = scalar_t::zero(); - for (uint64_t i = 0; i < N; ++i) { + for (uint64_t i = offset; i < N; i=i+stride) { h_out[0] = h_out[0] + h_a[i]; } END_TIMER(baseline_reduce_sum, "baseline reduce sum took"); START_TIMER(reduce_sum); - ICICLE_CHECK(bn254_vector_sum(d_a, N, &h_config, d_out, 0, 1)); - // ICICLE_CHECK(bn254_vector_sum(d_a, N, &d_config, d_out)); + ICICLE_CHECK(bn254_vector_sum(d_a, N, &h_config, d_out, offset, stride)); END_TIMER(reduce_sum, "reduce sum took"); @@ -113,18 +114,16 @@ int main(int argc, char** argv) std::cout << "d_out: " << d_out[0] << std::endl; - // return 0; - START_TIMER(baseline_reduce_product); h_out[0] = scalar_t::one(); - for (uint64_t i = 0; i < N; ++i) { + for (uint64_t i = offset; i < N; i = i + stride) { h_out[0] = h_out[0] * h_a[i]; } END_TIMER(baseline_reduce_product, "baseline reduce product took"); START_TIMER(reduce_product); - ICICLE_CHECK(bn254_vector_product(d_a, N, &d_config, d_out, 0, 1)); + ICICLE_CHECK(bn254_vector_product(d_a, N, &d_config, d_out, offset, stride)); END_TIMER(reduce_product, "reduce product took"); diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 3dba93937..48feb49ca 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -66,12 +66,12 @@ class VectorOpTask : public TaskBase dispatch(); } // Set the operands to execute a task of 1 operand and dispatch the task - void send_intermidiate_res_task(VecOperation operation, const int nof_operations, const T* op_a) + void send_intermidiate_res_task(VecOperation operation, const uint64_t stop_index, const T* op_a, const uint64_t stride) 
{ m_operation = operation; - m_nof_operations = nof_operations; + m_stop_index = stop_index; m_op_a = op_a; - // SP: where is m_output? + m_stride = stride; dispatch(); } @@ -155,20 +155,16 @@ class VectorOpTask : public TaskBase // Single worker functionality to execute sum(vector) void vector_sum() { - // SP: *m_output = m_op_a[0]; - m_intermidiate_res = m_op_a[0]; - for (uint64_t i = 1; i < m_nof_operations; ++i) { - // SP: *m_output = *m_output + m_op_a[i]; + m_intermidiate_res = T::zero(); + for (uint64_t i = 0; i < m_stop_index; i = i + m_stride) { m_intermidiate_res = m_intermidiate_res + m_op_a[i]; } } // Single worker functionality to execute product(vector) void vector_product() { - // SP: *m_output = m_op_a[0]; - m_intermidiate_res = m_op_a[0]; - for (uint64_t i = 1; i < m_nof_operations; ++i) { - // SP: *m_output = *m_output * m_op_a[i]; + m_intermidiate_res = T::one(); + for (uint64_t i = 0; i < m_stop_index; i = i + m_stride) { m_intermidiate_res = m_intermidiate_res * m_op_a[i]; } } @@ -244,6 +240,7 @@ class VectorOpTask : public TaskBase const T* m_op_a; // pointer to operand A. Operand A is a vector. const T* m_op_b; // pointer to operand B. Operand B is a vector or scalar uint64_t m_start_index; // index used in bitreverse + uint64_t m_stop_index; // index used in reduce operations int m_bit_size; // use in bitrev operation uint64_t m_stride; // used in slice operation T* m_output; // pointer to the output. Can be a vector or scalar pointer @@ -352,7 +349,9 @@ eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, co { TasksManager> task_manager(get_nof_workers(config)); bool output_initialized = false; - uint64_t vec_a_offset = 0; + uint64_t vec_a_offset = offset; + assert(stride > 0); + const uint64_t slice_length = stride * NOF_OPERATIONS_PER_TASK; // run until all vector deployed and all tasks completed while (true) { VectorOpTask* task_p = vec_a_offset < n ? 
task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); @@ -365,8 +364,8 @@ eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, co } if (vec_a_offset < n) { task_p->send_intermidiate_res_task( - VecOperation::VECTOR_SUM, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_a_offset), vec_a + vec_a_offset); - vec_a_offset += NOF_OPERATIONS_PER_TASK; + VecOperation::VECTOR_SUM, std::min( slice_length , n - vec_a_offset), vec_a + vec_a_offset, stride); + vec_a_offset += slice_length; } else { task_p->set_idle(); @@ -380,10 +379,11 @@ REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); template eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output, uint64_t offset, uint64_t stride) { - ICICLE_LOG_INFO << "cpu_vector_product"; TasksManager> task_manager(get_nof_workers(config)); bool output_initialized = false; - uint64_t vec_a_offset = 0; + uint64_t vec_a_offset = offset; + assert(stride > 0); + const uint64_t slice_length = stride * NOF_OPERATIONS_PER_TASK; // run until all vector deployed and all tasks completed while (true) { @@ -397,8 +397,8 @@ eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n } if (vec_a_offset < n) { task_p->send_intermidiate_res_task( - VecOperation::VECTOR_PRODUCT, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_a_offset), vec_a + vec_a_offset); - vec_a_offset += NOF_OPERATIONS_PER_TASK; + VecOperation::VECTOR_PRODUCT, std::min(slice_length, n - vec_a_offset), vec_a + vec_a_offset, stride); + vec_a_offset += slice_length; } else { task_p->set_idle(); From 9aaf944ff5286014a5eb747f4dc8f61de3d26b02 Mon Sep 17 00:00:00 2001 From: Shanie Winitz Date: Wed, 9 Oct 2024 09:14:50 +0300 Subject: [PATCH 07/43] vec_ops batch added --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 609 ++++++++++++------ .../include/icicle/backend/vec_ops_backend.h | 45 +- .../default_backend/default_poly_backend.h | 12 +- 
icicle/include/icicle/vec_ops.h | 328 ++++++---- icicle/src/vec_ops.cpp | 164 ++--- 5 files changed, 752 insertions(+), 406 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 48feb49ca..952f5108f 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -6,6 +6,9 @@ #include "icicle/fields/field_config.h" #include "tasks_manager.h" +#include +#include +#include using namespace field_config; using namespace icicle; @@ -18,14 +21,15 @@ enum VecOperation { VECTOR_MUL, VECTOR_DIV, VECTOR_SUM, + CONVERT_TO_MONTGOMERY, + CONVERT_FROM_MONTGOMERY, VECTOR_PRODUCT, SCALAR_ADD_VEC, SCALAR_SUB_VEC, SCALAR_MUL_VEC, - CONVERT_TO_MONTGOMERY, - CONVERT_FROM_MONTGOMERY, BIT_REVERSE, SLICE, + REPLACE_ELEMENTS, NOF_OPERATIONS }; @@ -46,18 +50,19 @@ class VectorOpTask : public TaskBase VectorOpTask() : TaskBase() {} // Set the operands to execute a task of 2 operands and 1 output and dispatch the task - void send_2ops_task(VecOperation operation, const int nof_operations, const T* op_a, const T* op_b, T* output) + void send_2ops_task(VecOperation operation, const uint32_t nof_operations, const T* op_a, const T* op_b, const uint32_t stride , T* output) { m_operation = operation; m_nof_operations = nof_operations; m_op_a = op_a; m_op_b = op_b; + m_stride = stride; m_output = output; dispatch(); } // Set the operands to execute a task of 1 operand and 1 output and dispatch the task - void send_1op_task(VecOperation operation, const int nof_operations, const T* op_a, T* output) + void send_1op_task(VecOperation operation, const uint32_t nof_operations, const T* op_a, T* output) { m_operation = operation; m_nof_operations = nof_operations; @@ -75,29 +80,48 @@ class VectorOpTask : public TaskBase dispatch(); } - // Set the operands to bitrev operation dispatch the task + // Set the operands for bitrev operation and dispatch the task void send_bitrev_task( - 
VecOperation operation, int bit_size, uint64_t start_index, const int nof_operations, const T* op_a, T* output) + VecOperation operation, uint32_t bit_size, uint64_t start_index, const uint32_t nof_operations, const T* op_a, const uint64_t stride, T* output) { m_operation = operation; + m_bit_size = bit_size; + m_start_index = start_index; m_nof_operations = nof_operations; m_op_a = op_a; + m_stride = stride; m_output = output; - m_bit_size = bit_size, m_start_index = start_index; dispatch(); } - // Set the operands to slice operation dispatch the task - void send_slice_task(VecOperation operation, uint64_t stride, const int nof_operations, const T* op_a, T* output) + // Set the operands for slice operation and dispatch the task + void send_slice_task(VecOperation operation, uint64_t stride, uint64_t stride_out, const uint32_t nof_operations, const T* op_a, T* output) { m_operation = operation; m_nof_operations = nof_operations; m_op_a = op_a; m_output = output; m_stride = stride; + m_stride_out = stride_out; + dispatch(); + } + + // Set the operands for replace_elements operation and dispatch the task + void send_replace_elements_task(VecOperation operation, const T* mat_in, const uint32_t nof_operations, std::vector& start_indices_in_mat, uint64_t start_index, uint32_t log_nof_rows, uint32_t log_nof_cols, const uint32_t stride, T* mat_out) + { + m_operation = operation; + m_op_a = mat_in; + m_nof_operations = nof_operations; + m_start_indices_in_mat = &start_indices_in_mat; + m_start_index = start_index; //start index in start_indices vector + m_log_nof_rows = log_nof_rows; + m_log_nof_cols = log_nof_cols; + m_stride = stride; + m_output = mat_out; dispatch(); } + // Execute the selected function based on m_operation virtual void execute() { (this->*functionPtrs[static_cast(m_operation)])(); } @@ -131,56 +155,55 @@ class VectorOpTask : public TaskBase m_output[i] = m_op_a[i] * T::inverse(m_op_b[i]); } } - // Single worker functionality to execute scalar + vector 
- void scalar_add_vec() - { - for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = *m_op_a + m_op_b[i]; - } - } - // Single worker functionality to execute scalar - vector - void scalar_sub_vec() + // Single worker functionality to execute conversion from barret to montgomery + void convert_to_montgomery() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = *m_op_a + m_op_b[i]; + m_output[i] = T::to_montgomery(m_op_a[i]); } } - // Single worker functionality to execute scalar * vector - void scalar_mul_vec() + // Single worker functionality to execute conversion from montgomery to barret + void convert_from_montgomery() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = *m_op_a * m_op_b[i]; + m_output[i] = T::from_montgomery(m_op_a[i]); } } // Single worker functionality to execute sum(vector) void vector_sum() { - m_intermidiate_res = T::zero(); - for (uint64_t i = 0; i < m_stop_index; i = i + m_stride) { - m_intermidiate_res = m_intermidiate_res + m_op_a[i]; + m_intermidiate_res[m_idx_in_batch] = T::zero(); + for (uint64_t i = 0; i < (m_stop_index * m_stride); i = i + m_stride) { + m_intermidiate_res[m_idx_in_batch] = m_intermidiate_res[m_idx_in_batch] + m_op_a[i]; } } // Single worker functionality to execute product(vector) void vector_product() { - m_intermidiate_res = T::one(); - for (uint64_t i = 0; i < m_stop_index; i = i + m_stride) { - m_intermidiate_res = m_intermidiate_res * m_op_a[i]; + m_intermidiate_res[m_idx_in_batch] = T::one(); + for (uint64_t i = 0; i < (m_stop_index * m_stride); i = i + m_stride) { + m_intermidiate_res[m_idx_in_batch] = m_intermidiate_res[m_idx_in_batch] * m_op_a[i]; } } - // Single worker functionality to execute conversion from barret to montgomery - void convert_to_montgomery() + // Single worker functionality to execute scalar + vector + void scalar_add_vec() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = T::to_montgomery(m_op_a[i]); + m_output[m_stride * i] 
= *m_op_a + m_op_b[m_stride * i]; } } - - // Single worker functionality to execute conversion from montgomery to barret - void convert_from_montgomery() + // Single worker functionality to execute scalar - vector + void scalar_sub_vec() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = T::from_montgomery(m_op_a[i]); + m_output[m_stride * i] = *m_op_a - m_op_b[m_stride * i]; + } + } + // Single worker functionality to execute scalar * vector + void scalar_mul_vec() + { + for (uint64_t i = 0; i < m_nof_operations; ++i) { + m_output[m_stride * i] = *m_op_a * m_op_b[m_stride * i]; } } // Single worker functionality to execute bit reverse reorder @@ -201,10 +224,10 @@ class VectorOpTask : public TaskBase if (m_output == m_op_a) { // inplace calculation if (rev_idx < idx) { // only on of the threads need to work - std::swap(m_output[idx], m_output[rev_idx]); + std::swap(m_output[m_stride*idx], m_output[m_stride*rev_idx]); } } else { // out of place calculation - m_output[idx] = m_op_a[rev_idx]; // set index value + m_output[m_stride*idx] = m_op_a[m_stride*rev_idx]; // set index value } } } @@ -213,10 +236,40 @@ class VectorOpTask : public TaskBase void slice() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = m_op_a[i * m_stride]; + m_output[i * m_stride_out] = m_op_a[i * m_stride]; } } + // Function to perform modulus with Mersenne number + uint64_t mersenne_mod(uint64_t shifted_idx, uint32_t total_bits) { + uint64_t mod = (1ULL << total_bits) - 1; + shifted_idx = (shifted_idx & mod) + (shifted_idx >> total_bits); + while (shifted_idx >= mod) { + shifted_idx = (shifted_idx & mod) + (shifted_idx >> total_bits); + } + // If shifted_idx == mod, result should be 0 since mod % mod == 0 + if (shifted_idx == mod) shifted_idx = 0; //TODO SHANIE - check if redundant + return shifted_idx; + } + + + // Single worker functionality to execute replace elements + void replace_elements() + { + const uint32_t total_bits = m_log_nof_rows + 
m_log_nof_cols; + for (uint32_t i = 0; i < m_nof_operations; ++i) { + uint64_t start_idx = (*m_start_indices_in_mat)[m_start_index + i]; + uint64_t idx = start_idx; + do { + uint64_t shifted_idx = idx << m_log_nof_rows; + uint64_t new_idx = mersenne_mod(shifted_idx, total_bits); + m_output[m_stride * new_idx] = m_op_a[m_stride * idx]; + idx = new_idx; + } while (idx != start_idx); + } + } + + // An array of available function pointers arranged according to the VecOperation enum using FunctionPtr = void (VectorOpTask::*)(); static constexpr std::array(NOF_OPERATIONS)> functionPtrs = { @@ -224,29 +277,36 @@ class VectorOpTask : public TaskBase &VectorOpTask::vector_sub, // VECTOR_SUB, &VectorOpTask::vector_mul, // VECTOR_MUL, &VectorOpTask::vector_div, // VECTOR_DIV, + &VectorOpTask::convert_to_montgomery, // CONVERT_TO_MONTGOMERY, + &VectorOpTask::convert_from_montgomery, // CONVERT_FROM_MONTGOMERY, &VectorOpTask::vector_sum, // VECTOR_SUM &VectorOpTask::vector_product, // VECTOR_PRODUCT &VectorOpTask::scalar_add_vec, // SCALAR_ADD_VEC, &VectorOpTask::scalar_sub_vec, // SCALAR_SUB_VEC, &VectorOpTask::scalar_mul_vec, // SCALAR_MUL_VEC, - &VectorOpTask::convert_to_montgomery, // CONVERT_TO_MONTGOMERY, - &VectorOpTask::convert_from_montgomery, // CONVERT_FROM_MONTGOMERY, &VectorOpTask::bit_reverse, // BIT_REVERSE - &VectorOpTask::slice // SLICE + &VectorOpTask::slice, // SLICE + &VectorOpTask::replace_elements // REPLACE_ELEMENTS }; VecOperation m_operation; // the operation to execute - int m_nof_operations; // number of operations to execute for this task - const T* m_op_a; // pointer to operand A. Operand A is a vector. + uint32_t m_nof_operations; // number of operations to execute for this task + const T* m_op_a; // pointer to operand A. Operand A is a vector, or metrix in case of replace_elements const T* m_op_b; // pointer to operand B. 
Operand B is a vector or scalar - uint64_t m_start_index; // index used in bitreverse + uint64_t m_start_index; // index used in bitreverse operation uint64_t m_stop_index; // index used in reduce operations - int m_bit_size; // use in bitrev operation - uint64_t m_stride; // used in slice operation - T* m_output; // pointer to the output. Can be a vector or scalar pointer + uint32_t m_bit_size; // use in bitrev operation + uint64_t m_stride; // used to support column batch operations + uint64_t m_stride_out; // used in slice operation + T* m_output; // pointer to the output. Can be a vector, scalar pointer, or a matrix pointer in case of replace_elements + uint32_t m_log_nof_rows; // log of the number of rows in the matrix, used in replace_elements + uint32_t m_log_nof_cols; // log of the number of columns in the matrix, used in replace_elements + const std::vector* m_start_indices_in_mat; // Indices used in replace_elements operations + public: - T m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer -}; + T* m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer + uint64_t m_idx_in_batch; // index in the batch. 
Used in intermidiate res tasks +}; // class VectorOpTask #define NOF_OPERATIONS_PER_TASK 512 #define CONFIG_NOF_THREADS_KEY "n_threads" @@ -263,12 +323,13 @@ int get_nof_workers(const VecOpsConfig& config) // Execute a full task from the type vector = vector (op) vector template eIcicleError -cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < n; i += NOF_OPERATIONS_PER_TASK) { + const uint64_t total_nof_operations = size*config.batch_size; + for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_2ops_task(op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - i), vec_a + i, vec_b + i, output + i); + task_p->send_2ops_task(op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), vec_a + i, vec_b + i, 1, output + i); } task_manager.wait_done(); return eIcicleError::SUCCESS; @@ -277,12 +338,22 @@ cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t n, con // Execute a full task from the type vector = scalar (op) vector template eIcicleError cpu_scalar_vector_op( - VecOperation op, const T* scalar_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) + VecOperation op, const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output) { TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < n; i += NOF_OPERATIONS_PER_TASK) { - VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_2ops_task(op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - i), scalar_a, vec_b + i, output + i); + const uint64_t total_nof_operations = use_single_scalar? 
size*config.batch_size : size; + const uint32_t stride = (!use_single_scalar && config.columns_batch)? config.batch_size : 1; + for (uint32_t idx_in_batch = 0; idx_in_batch < (use_single_scalar? 1 : config.batch_size); idx_in_batch++) { + for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_2ops_task( + op, + std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), + scalar_a + idx_in_batch, + (!use_single_scalar && config.columns_batch)? vec_b + idx_in_batch + i*config.batch_size : vec_b + idx_in_batch*size + i, + stride, + (!use_single_scalar && config.columns_batch)? output + idx_in_batch + i*config.batch_size : output + idx_in_batch*size + i); + } } task_manager.wait_done(); return eIcicleError::SUCCESS; @@ -290,11 +361,12 @@ eIcicleError cpu_scalar_vector_op( /////////////////////////////////////////////////////// // Functions to register at the CPU backend +/*********************************** ADD ***********************************/ template eIcicleError -cpu_vector_add(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +cpu_vector_add(const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_2vectors_op(VecOperation::VECTOR_ADD, vec_a, vec_b, n, config, output); + return cpu_2vectors_op(VecOperation::VECTOR_ADD, vec_a, vec_b, size, config, output); } REGISTER_VECTOR_ADD_BACKEND("CPU", cpu_vector_add); @@ -302,12 +374,9 @@ REGISTER_VECTOR_ADD_BACKEND("CPU", cpu_vector_add); /*********************************** ACCUMULATE ***********************************/ template eIcicleError -cpu_vector_accumulate(const Device& device, T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config) +cpu_vector_accumulate(const Device& device, T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config) { - for (uint64_t i = 
0; i < n; ++i) { - vec_a[i] = vec_a[i] + vec_b[i]; - } - return eIcicleError::SUCCESS; + return cpu_2vectors_op(VecOperation::VECTOR_ADD, vec_a, vec_b, size, config, vec_a); } REGISTER_VECTOR_ACCUMULATE_BACKEND("CPU", cpu_vector_accumulate); @@ -315,9 +384,9 @@ REGISTER_VECTOR_ACCUMULATE_BACKEND("CPU", cpu_vector_accumulate); /*********************************** SUB ***********************************/ template eIcicleError -cpu_vector_sub(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +cpu_vector_sub(const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_2vectors_op(VecOperation::VECTOR_SUB, vec_a, vec_b, n, config, output); + return cpu_2vectors_op(VecOperation::VECTOR_SUB, vec_a, vec_b, size, config, output); } REGISTER_VECTOR_SUB_BACKEND("CPU", cpu_vector_sub); @@ -325,9 +394,9 @@ REGISTER_VECTOR_SUB_BACKEND("CPU", cpu_vector_sub); /*********************************** MUL ***********************************/ template eIcicleError -cpu_vector_mul(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +cpu_vector_mul(const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_2vectors_op(VecOperation::VECTOR_MUL, vec_a, vec_b, n, config, output); + return cpu_2vectors_op(VecOperation::VECTOR_MUL, vec_a, vec_b, size, config, output); } REGISTER_VECTOR_MUL_BACKEND("CPU", cpu_vector_mul); @@ -335,37 +404,71 @@ REGISTER_VECTOR_MUL_BACKEND("CPU", cpu_vector_mul); /*********************************** DIV ***********************************/ template eIcicleError -cpu_vector_div(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +cpu_vector_div(const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return 
cpu_2vectors_op(VecOperation::VECTOR_DIV, vec_a, vec_b, n, config, output); + return cpu_2vectors_op(VecOperation::VECTOR_DIV, vec_a, vec_b, size, config, output); } REGISTER_VECTOR_DIV_BACKEND("CPU", cpu_vector_div); +/*********************************** CONVERT MONTGOMERY ***********************************/ +template +eIcicleError cpu_convert_montgomery( + const Device& device, const T* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, T* output) +{ + TasksManager> task_manager(get_nof_workers(config)); + const uint64_t total_nof_operations = size*config.batch_size; + for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_1op_task( + is_to_montgomery ? CONVERT_TO_MONTGOMERY : CONVERT_FROM_MONTGOMERY, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), + input + i, output + i); + } + task_manager.wait_done(); + return eIcicleError::SUCCESS; +} + +REGISTER_CONVERT_MONTGOMERY_BACKEND("CPU", cpu_convert_montgomery); + +#ifdef EXT_FIELD +REGISTER_VECTOR_ADD_EXT_FIELD_BACKEND("CPU", cpu_vector_add); +REGISTER_VECTOR_ACCUMULATE_EXT_FIELD_BACKEND("CPU", cpu_vector_accumulate); +REGISTER_VECTOR_SUB_EXT_FIELD_BACKEND("CPU", cpu_vector_sub); +REGISTER_VECTOR_MUL_EXT_FIELD_BACKEND("CPU", cpu_vector_mul); +REGISTER_CONVERT_MONTGOMERY_EXT_FIELD_BACKEND("CPU", cpu_convert_montgomery); +#endif // EXT_FIELD + /*********************************** SUM ***********************************/ template -eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output, uint64_t offset, uint64_t stride) +eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) { TasksManager> task_manager(get_nof_workers(config)); - bool output_initialized = false; - uint64_t vec_a_offset = offset; - assert(stride > 0); - const uint64_t 
slice_length = stride * NOF_OPERATIONS_PER_TASK; + std::vector output_initialized = std::vector(config.batch_size, false); + uint64_t vec_a_offset = 0; + uint64_t idx_in_batch = 0; // run until all vector deployed and all tasks completed while (true) { - VectorOpTask* task_p = vec_a_offset < n ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + VectorOpTask* task_p = vec_a_offset < size ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); if (task_p == nullptr) { return eIcicleError::SUCCESS; } if (task_p->is_completed()) { - *output = output_initialized ? *output + task_p->m_intermidiate_res : task_p->m_intermidiate_res; - output_initialized = true; + output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] ? output[task_p->m_idx_in_batch] + task_p->m_intermidiate_res[task_p->m_idx_in_batch] : task_p->m_intermidiate_res[task_p->m_idx_in_batch]; + output_initialized[task_p->m_idx_in_batch] = true; } - if (vec_a_offset < n) { + if (vec_a_offset < size) { + task_p->m_idx_in_batch = idx_in_batch; task_p->send_intermidiate_res_task( - VecOperation::VECTOR_SUM, std::min( slice_length , n - vec_a_offset), vec_a + vec_a_offset, stride); - vec_a_offset += slice_length; + VecOperation::VECTOR_SUM, + std::min((uint64_t)NOF_OPERATIONS_PER_TASK , size - vec_a_offset), + config.columns_batch? vec_a + idx_in_batch + vec_a_offset*config.batch_size : vec_a + idx_in_batch*size + vec_a_offset, + config.columns_batch? 
config.batch_size : 1); + idx_in_batch++; + if (idx_in_batch == config.batch_size) { + vec_a_offset += NOF_OPERATIONS_PER_TASK; + idx_in_batch = 0; + } } else { task_p->set_idle(); @@ -377,53 +480,49 @@ REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); /*********************************** PRODUCT ***********************************/ template -eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output, uint64_t offset, uint64_t stride) +eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) { TasksManager> task_manager(get_nof_workers(config)); - bool output_initialized = false; - uint64_t vec_a_offset = offset; - assert(stride > 0); - const uint64_t slice_length = stride * NOF_OPERATIONS_PER_TASK; - + std::vector output_initialized = std::vector(config.batch_size, false); + uint64_t vec_a_offset = 0; + uint64_t idx_in_batch = 0; // run until all vector deployed and all tasks completed while (true) { - VectorOpTask* task_p = vec_a_offset < n ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + VectorOpTask* task_p = vec_a_offset < size ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); if (task_p == nullptr) { return eIcicleError::SUCCESS; } if (task_p->is_completed()) { - *output = output_initialized ? *output * task_p->m_intermidiate_res : task_p->m_intermidiate_res; - output_initialized = true; + output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] ? 
output[task_p->m_idx_in_batch] + task_p->m_intermidiate_res[task_p->m_idx_in_batch] : task_p->m_intermidiate_res[task_p->m_idx_in_batch]; + output_initialized[task_p->m_idx_in_batch] = true; } - if (vec_a_offset < n) { + if (vec_a_offset < size) { + task_p->m_idx_in_batch = idx_in_batch; task_p->send_intermidiate_res_task( - VecOperation::VECTOR_PRODUCT, std::min(slice_length, n - vec_a_offset), vec_a + vec_a_offset, stride); - vec_a_offset += slice_length; + VecOperation::VECTOR_PRODUCT, + std::min((uint64_t)NOF_OPERATIONS_PER_TASK , size - vec_a_offset), + config.columns_batch? vec_a + idx_in_batch + vec_a_offset*config.batch_size : vec_a + idx_in_batch*size + vec_a_offset, + config.columns_batch? config.batch_size : 1); + idx_in_batch++; + if (idx_in_batch == config.batch_size) { + vec_a_offset += NOF_OPERATIONS_PER_TASK; + idx_in_batch = 0; + } } else { task_p->set_idle(); } - } + } } REGISTER_VECTOR_PRODUCT_BACKEND("CPU", cpu_vector_product); -/*********************************** MUL BY SCALAR***********************************/ -template -eIcicleError cpu_scalar_mul( - const Device& device, const T* scalar_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) -{ - return cpu_scalar_vector_op(VecOperation::SCALAR_MUL_VEC, scalar_a, vec_b, n, config, output); -} - -REGISTER_SCALAR_MUL_VEC_BACKEND("CPU", cpu_scalar_mul); - /*********************************** Scalar + Vector***********************************/ template eIcicleError cpu_scalar_add( - const Device& device, const T* scalar_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) + const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output) { - return cpu_scalar_vector_op(VecOperation::SCALAR_ADD_VEC, scalar_a, vec_b, n, config, output); + return cpu_scalar_vector_op(VecOperation::SCALAR_ADD_VEC, scalar_a, vec_b, size, use_single_scalar, config, output); } REGISTER_SCALAR_ADD_VEC_BACKEND("CPU", 
cpu_scalar_add); @@ -431,60 +530,161 @@ REGISTER_SCALAR_ADD_VEC_BACKEND("CPU", cpu_scalar_add); /*********************************** Scalar - Vector***********************************/ template eIcicleError cpu_scalar_sub( - const Device& device, const T* scalar_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) + const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output) { - return cpu_scalar_vector_op(VecOperation::SCALAR_SUB_VEC, scalar_a, vec_b, n, config, output); + return cpu_scalar_vector_op(VecOperation::SCALAR_SUB_VEC, scalar_a, vec_b, size, use_single_scalar, config, output); } REGISTER_SCALAR_SUB_VEC_BACKEND("CPU", cpu_scalar_sub); -/*********************************** CONVERT MONTGOMERY ***********************************/ +/*********************************** MUL BY SCALAR***********************************/ template -eIcicleError cpu_convert_montgomery( - const Device& device, const T* input, uint64_t n, bool is_into, const VecOpsConfig& config, T* output) +eIcicleError cpu_scalar_mul( + const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < n; i += NOF_OPERATIONS_PER_TASK) { - VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_1op_task( - is_into ? 
CONVERT_TO_MONTGOMERY : CONVERT_FROM_MONTGOMERY, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - i), - input + i, output + i); + return cpu_scalar_vector_op(VecOperation::SCALAR_MUL_VEC, scalar_a, vec_b, size, use_single_scalar, config, output); +} + +REGISTER_SCALAR_MUL_VEC_BACKEND("CPU", cpu_scalar_mul); + +/*********************************** TRANSPOSE ***********************************/ +// template todo shanie - remove +// eIcicleError cpu_matrix_transpose_basic( +// const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) +// { +// ICICLE_ASSERT(mat_in && mat_out && nof_rows != 0 && nof_cols != 0) << "Invalid argument"; + +// // Perform the matrix transpose +// for (uint32_t i = 0; i < nof_rows; ++i) { +// for (uint32_t j = 0; j < nof_cols; ++j) { +// mat_out[j * nof_rows + i] = mat_in[i * nof_cols + j]; +// } +// } + +// return eIcicleError::SUCCESS; +// } + +template +eIcicleError cpu_matrix_transpose_batch( + const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) +{ + const T* cur_mat_in = mat_in; + T* cur_mat_out = mat_out; + uint32_t stride = config.columns_batch? config.batch_size : 1; + const uint64_t total_elements = static_cast(nof_rows) * nof_cols; + for (uint32_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + // Perform the matrix transpose + for (uint32_t i = 0; i < nof_rows; ++i) { + for (uint32_t j = 0; j < nof_cols; ++j) { + cur_mat_out[stride*(j * nof_rows + i)] = cur_mat_in[stride*(i * nof_cols + j)]; + } + } + cur_mat_in += (config.columns_batch ? 1 : total_elements); + cur_mat_out += (config.columns_batch ? 
1 : total_elements); } - task_manager.wait_done(); + return eIcicleError::SUCCESS; } -REGISTER_CONVERT_MONTGOMERY_BACKEND("CPU", cpu_convert_montgomery); +uint32_t gcd(uint32_t a, uint32_t b) { + while (b != 0) { + uint32_t temp = b; + b = a % b; + a = temp; + } + return a; +} -#ifdef EXT_FIELD -REGISTER_VECTOR_ADD_EXT_FIELD_BACKEND("CPU", cpu_vector_add); -REGISTER_VECTOR_ACCUMULATE_EXT_FIELD_BACKEND("CPU", cpu_vector_accumulate); -REGISTER_VECTOR_SUB_EXT_FIELD_BACKEND("CPU", cpu_vector_sub); -REGISTER_VECTOR_MUL_EXT_FIELD_BACKEND("CPU", cpu_vector_mul); -REGISTER_CONVERT_MONTGOMERY_EXT_FIELD_BACKEND("CPU", cpu_convert_montgomery); -#endif // EXT_FIELD +// template //TODO shanie - remove +// void replace_elements(uint32_t start_idx, uint32_t log_nof_rows, uint32_t log_nof_cols, const T* mat_in, T* mat_out) { +// uint64_t idx = start_idx; + +// while (true) { +// uint64_t new_idx = mersenne_mod(idx << log_nof_rows, log_nof_rows+log_nof_cols); // new_idx = (idx< +void gen_necklace(uint32_t t, uint32_t p, uint32_t k, uint32_t length, std::vector& necklace, std::vector& task_indices) { + if (t > length) { + if (length % p == 0 && !std::all_of(necklace.begin() + 1, necklace.begin() + length + 1,[first_element = necklace[1]](uint32_t x) { return x == first_element; })) { + uint32_t start_idx = 0; + uint64_t multiplier = 1; + for (int i = length; i >= 1; --i) { // Compute start_idx as the decimal representation of the necklace + start_idx += necklace[i] * multiplier; + multiplier *= k; + } + // for (int i = 1; i <= length; ++i) { // Compute start_idx as the decimal representation of the necklace //TODO SHANIE - remove + // start_idx = start_idx + necklace[i] * std::pow(k, length - i); + // } + task_indices.push_back(start_idx); + } + return; + } + + necklace[t] = necklace[t - p]; + gen_necklace(t + 1, p, k, length, necklace, task_indices); + + for (int i = necklace[t - p] + 1; i < k; ++i) { + necklace[t] = i; + gen_necklace(t + 1, t, k, length, necklace, task_indices); + 
} +} -/*********************************** TRANSPOSE ***********************************/ template -eIcicleError cpu_matrix_transpose( +eIcicleError cpu_matrix_transpose_parallel( const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) { - // Check for invalid arguments - if (!mat_in || !mat_out || nof_rows == 0 || nof_cols == 0) { return eIcicleError::INVALID_ARGUMENT; } + ICICLE_ASSERT(mat_in && mat_out && nof_rows != 0 && nof_cols != 0) << "Invalid argument"; - // Perform the matrix transpose - for (uint32_t i = 0; i < nof_rows; ++i) { - for (uint32_t j = 0; j < nof_cols; ++j) { - mat_out[j * nof_rows + i] = mat_in[i * nof_cols + j]; - } + // check if the number of rows and columns are powers of 2, if not use the basic transpose + if ((nof_rows & (nof_rows - 1)) != 0 || (nof_cols & (nof_cols - 1)) != 0) { + cpu_matrix_transpose_batch(device, mat_in, nof_rows, nof_cols, config, mat_out); + return eIcicleError::SUCCESS; } + uint32_t log_nof_rows = static_cast(std::floor(std::log2(nof_rows))); + uint32_t log_nof_cols = static_cast(std::floor(std::log2(nof_cols))); + uint32_t gcd_value = gcd(log_nof_rows, log_nof_cols); + uint32_t k = 1 << gcd_value; // Base of necklaces + uint32_t length = (log_nof_cols + log_nof_rows) / gcd_value; // length of necklaces. 
Since all are powers of 2, equivalent to (log_nof_cols + log_nof_rows) / gcd_value; + const uint64_t max_nof_operations = NOF_OPERATIONS_PER_TASK / length; + const uint64_t total_elements = static_cast(nof_rows) * nof_cols; + + std::vector necklace(length + 1, 0); + std::vector start_indices_in_mat; // Collect start indices + gen_necklace(1, 1, k, length, necklace, start_indices_in_mat); + + TasksManager> task_manager(get_nof_workers(config)); + for (uint64_t i = 0; i < start_indices_in_mat.size(); i += max_nof_operations) { + uint64_t nof_operations = std::min((uint64_t)max_nof_operations, start_indices_in_mat.size() - i); + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_replace_elements_task( + REPLACE_ELEMENTS, + config.columns_batch? mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements, + nof_operations, + start_indices_in_mat, + i, + log_nof_rows, + log_nof_cols, + config.columns_batch? config.batch_size : 1, + config.columns_batch?
mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements); + } + } + task_manager.wait_done(); return eIcicleError::SUCCESS; } -REGISTER_MATRIX_TRANSPOSE_BACKEND("CPU", cpu_matrix_transpose); +REGISTER_MATRIX_TRANSPOSE_BACKEND("CPU", cpu_matrix_transpose_parallel); #ifdef EXT_FIELD -REGISTER_MATRIX_TRANSPOSE_EXT_FIELD_BACKEND("CPU", cpu_matrix_transpose); +REGISTER_MATRIX_TRANSPOSE_EXT_FIELD_BACKEND("CPU", cpu_matrix_transpose_parallel); #endif // EXT_FIELD /*********************************** BIT REVERSE ***********************************/ @@ -492,21 +692,26 @@ template eIcicleError cpu_bit_reverse(const Device& device, const T* vec_in, uint64_t size, const VecOpsConfig& config, T* vec_out) { - // Check for invalid arguments - if (!vec_in || !vec_out || size == 0) { return eIcicleError::INVALID_ARGUMENT; } + ICICLE_ASSERT(vec_in && vec_out && size != 0) << "Invalid argument"; - // Calculate log2(size) - int logn = static_cast(std::floor(std::log2(size))); - if ((1ULL << logn) != size) { - return eIcicleError::INVALID_ARGUMENT; // Ensure size is a power of 2 - } + uint32_t logn = static_cast(std::floor(std::log2(size))); + ICICLE_ASSERT((1ULL << logn) == size) << "Invalid argument - size is not a power of 2"; // Perform the bit reverse TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < size; i += NOF_OPERATIONS_PER_TASK) { - VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_bitrev_task( - BIT_REVERSE, logn, i, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - i), vec_in, vec_out); + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + for (uint64_t i = 0; i < size; i += NOF_OPERATIONS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + + task_p->send_bitrev_task( + BIT_REVERSE, + logn, + i, + std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - i), + config.columns_batch? 
vec_in + idx_in_batch : vec_in + idx_in_batch*size, + config.columns_batch? config.batch_size : 1, + config.columns_batch? vec_out + idx_in_batch: vec_out + idx_in_batch*size); + } } task_manager.wait_done(); return eIcicleError::SUCCESS; @@ -525,20 +730,27 @@ eIcicleError cpu_slice( const T* vec_in, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig& config, T* vec_out) { - if (vec_in == nullptr || vec_out == nullptr) { - ICICLE_LOG_ERROR << "Error: Invalid argument - input or output vector is null"; - return eIcicleError::INVALID_ARGUMENT; - } + + ICICLE_ASSERT(vec_in != nullptr && vec_out != nullptr) << "Error: Invalid argument - input or output vector is null"; + ICICLE_ASSERT(offset + (size_out-1) * stride < size_in) << "Error: Invalid argument - slice out of bound"; TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < size; i += NOF_OPERATIONS_PER_TASK) { - VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_slice_task( - SLICE, stride, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - i), vec_in + offset + i * stride, vec_out + i); + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + for (uint64_t i = 0; i < size_out; i += NOF_OPERATIONS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_slice_task( + SLICE, + config.columns_batch? stride*config.batch_size : stride, + config.columns_batch? config.batch_size : 1, + std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size_out - i), + config.columns_batch? vec_in + idx_in_batch + (offset + i * stride)*config.batch_size : vec_in + idx_in_batch*size_in + offset + i * stride, + config.columns_batch? 
vec_out + idx_in_batch + i*config.batch_size : vec_out + idx_in_batch*size_out + i); + } } task_manager.wait_done(); return eIcicleError::SUCCESS; @@ -549,6 +761,29 @@ REGISTER_SLICE_BACKEND("CPU", cpu_slice); REGISTER_SLICE_EXT_FIELD_BACKEND("CPU", cpu_slice); #endif // EXT_FIELD +/*********************************** Highest non-zero idx ***********************************/ +template +eIcicleError cpu_highest_non_zero_idx( + const Device& device, const T* input, uint64_t size, const VecOpsConfig& config, int64_t* out_idx /*OUT*/) +{ + ICICLE_ASSERT(input && out_idx && size !=0) << "Error: Invalid argument"; + uint64_t stride = config.columns_batch? config.batch_size : 1; + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) { + out_idx[idx_in_batch] = -1; // zero vector is considered '-1' since 0 would be zero in vec[0] + const T* curr_input = config.columns_batch? input + idx_in_batch : input + idx_in_batch * size; // Pointer to the current vector + for (int64_t i = size - 1; i >= 0; --i) { + if (curr_input[i * stride] != T::zero()) { + out_idx[idx_in_batch] = i; + break; + } + } + } + return eIcicleError::SUCCESS; +} + +REGISTER_HIGHEST_NON_ZERO_IDX_BACKEND("CPU", cpu_highest_non_zero_idx); + + /*********************************** Polynomial evaluation ***********************************/ template @@ -561,12 +796,18 @@ eIcicleError cpu_poly_eval( const VecOpsConfig& config, T* evals /*OUT*/) { + ICICLE_ASSERT(coeffs && domain && evals && coeffs_size != 0 && domain_size != 0) << "Error: Invalid argument"; // using Horner's method // example: ax^2+bx+c is computed as (1) r=a, (2) r=r*x+b, (3) r=r*x+c - for (uint64_t eval_idx = 0; eval_idx < domain_size; ++eval_idx) { - evals[eval_idx] = coeffs[coeffs_size - 1]; - for (int64_t coeff_idx = coeffs_size - 2; coeff_idx >= 0; --coeff_idx) { - evals[eval_idx] = evals[eval_idx] * domain[eval_idx] + coeffs[coeff_idx]; + uint64_t stride = config.columns_batch ? 
config.batch_size : 1; + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) { + const T* curr_coeffs = config.columns_batch? coeffs + idx_in_batch : coeffs + idx_in_batch * coeffs_size; + T* curr_evals = config.columns_batch? evals + idx_in_batch : evals + idx_in_batch * domain_size; + for (uint64_t eval_idx = 0; eval_idx < domain_size; ++eval_idx) { + curr_evals[eval_idx * stride] = curr_coeffs[(coeffs_size - 1) * stride]; + for (int64_t coeff_idx = coeffs_size - 2; coeff_idx >= 0; --coeff_idx) { + curr_evals[eval_idx * stride] = curr_evals[eval_idx * stride] * domain[eval_idx] + curr_coeffs[coeff_idx * stride]; + } } } return eIcicleError::SUCCESS; @@ -574,23 +815,6 @@ eIcicleError cpu_poly_eval( REGISTER_POLYNOMIAL_EVAL("CPU", cpu_poly_eval); -/*********************************** Highest non-zero idx ***********************************/ -template -eIcicleError cpu_highest_non_zero_idx( - const Device& device, const T* input, uint64_t size, const VecOpsConfig& config, int64_t* out_idx /*OUT*/) -{ - *out_idx = -1; // zero vector is considered '-1' since 0 would be zero in vec[0] - for (int64_t i = size - 1; i >= 0; --i) { - if (input[i] != T::zero()) { - *out_idx = i; - break; - } - } - return eIcicleError::SUCCESS; -} - -REGISTER_HIGHEST_NON_ZERO_IDX_BACKEND("CPU", cpu_highest_non_zero_idx); - /*============================== polynomial division ==============================*/ template void school_book_division_step_cpu(T* r, T* q, const T* b, int deg_r, int deg_b, const T& lc_b_inv) @@ -627,21 +851,24 @@ eIcicleError cpu_poly_divide( ICICLE_ASSERT(q_size >= (numerator_deg - denumerator_deg + 1)) << "polynomial division expects q(x) size to be at least deg(numerator)-deg(denumerator)+1"; - ICICLE_CHECK(icicle_copy_async(r_out, numerator, r_size * sizeof(T), config.stream)); - - // invert largest coeff of b - const T& lc_b_inv = T::inverse(denumerator[denumerator_deg]); - - int64_t deg_r = numerator_deg; - while (deg_r >= 
denumerator_deg) { - // each iteration is removing the largest monomial in r until deg(r)= denumerator_deg) { + // each iteration is removing the largest monomial in r until deg(r); + scalar_t* output)>; using scalarVectorOpImpl = std::function; - using scalarVectorOpImplInplaceA = std::function; + + using vectorVectorOpImpl = std::function; + + using vectorVectorOpImplInplaceA = std::function; void register_vector_sum(const std::string& deviceType, scalarVectorReduceOpImpl impl); @@ -51,7 +59,7 @@ namespace icicle { - void register_vector_add(const std::string& deviceType, scalarVectorOpImpl impl); + void register_vector_add(const std::string& deviceType, vectorVectorOpImpl impl); #define REGISTER_VECTOR_ADD_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ @@ -61,7 +69,7 @@ namespace icicle { }(); \ } - void register_vector_accumulate(const std::string& deviceType, scalarVectorOpImplInplaceA impl); + void register_vector_accumulate(const std::string& deviceType, vectorVectorOpImplInplaceA impl); #define REGISTER_VECTOR_ACCUMULATE_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ @@ -71,7 +79,7 @@ namespace icicle { }(); \ } - void register_vector_sub(const std::string& deviceType, scalarVectorOpImpl impl); + void register_vector_sub(const std::string& deviceType, vectorVectorOpImpl impl); #define REGISTER_VECTOR_SUB_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ static bool UNIQUE(_reg_vec_sub) = []() -> bool { \ @@ -80,7 +88,7 @@ namespace icicle { }(); \ } - void register_vector_mul(const std::string& deviceType, scalarVectorOpImpl impl); + void register_vector_mul(const std::string& deviceType, vectorVectorOpImpl impl); #define REGISTER_VECTOR_MUL_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ @@ -90,7 +98,7 @@ namespace icicle { }(); \ } - void register_vector_div(const std::string& deviceType, scalarVectorOpImpl impl); + void register_vector_div(const std::string& deviceType, vectorVectorOpImpl impl); #define REGISTER_VECTOR_DIV_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ @@ 
-134,7 +142,7 @@ namespace icicle { const Device& device, const scalar_t* input, uint64_t size, - bool is_into, + bool is_to_montgomery, const VecOpsConfig& config, scalar_t* output)>; @@ -184,7 +192,8 @@ namespace icicle { const scalar_t* input, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig& config, scalar_t* output)>; @@ -266,12 +275,12 @@ namespace icicle { const Device& device, const extension_t* vec_a, const extension_t* vec_b, - uint64_t n, + uint64_t size, const VecOpsConfig& config, extension_t* output)>; using extFieldVectorOpImplInplaceA = std::function; + const Device& device, extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config)>; void register_extension_vector_add(const std::string& deviceType, extFieldVectorOpImpl impl); @@ -316,7 +325,7 @@ namespace icicle { const Device& device, const extension_t* input, uint64_t size, - bool is_into, + bool is_to_montgomery, const VecOpsConfig& config, extension_t* output)>; diff --git a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h index f0643f978..7c0cca845 100644 --- a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h +++ b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h @@ -65,7 +65,7 @@ namespace icicle { config.is_async = true; config.stream = m_stream; - ICICLE_CHECK(icicle::slice(in_coeffs, offset, stride, out_size, config, out_coeffs)); + ICICLE_CHECK(icicle::slice(in_coeffs, offset, stride, size, out_size, config, out_coeffs)); } void add_sub(PolyContext& res, PolyContext a, PolyContext b, bool add1_sub0) @@ -126,7 +126,7 @@ namespace icicle { C zero = C::zero(); config.is_a_on_device = false; ICICLE_CHECK( - scalar_sub_vec(&zero, b_mem_p + min_op_size, b->get_nof_elements() - min_op_size, config, res_mem_p)); + scalar_sub_vec(&zero, b_mem_p + min_op_size, 
b->get_nof_elements() - min_op_size, true, config, res_mem_p)); } } @@ -173,7 +173,7 @@ namespace icicle { config.is_result_on_device = true; config.is_async = true; config.stream = m_stream; - icicle::scalar_mul_vec(&scalar, p_elements_p, N, config, out_evals_p); + icicle::scalar_mul_vec(&scalar, p_elements_p, N, true, config, out_evals_p); } void multiply_with_padding(PolyContext c, PolyContext a, PolyContext b) @@ -409,7 +409,7 @@ namespace icicle { config.is_async = true; config.stream = m_stream; icicle::scalar_mul_vec( - &v_coset_eval, numerator_evals_reversed_p + N /*second half is the reversed coset*/, N, config, + &v_coset_eval, numerator_evals_reversed_p + N /*second half is the reversed coset*/, N, true, config, out_evals_reversed_p); // INTT back from reversed evals on coset to coeffs @@ -450,7 +450,7 @@ namespace icicle { config.is_result_on_device = true; config.is_async = true; config.stream = m_stream; - icicle::scalar_mul_vec(&v_coset_eval, out_evals_reversed_p, N, config, out_evals_reversed_p); + icicle::scalar_mul_vec(&v_coset_eval, out_evals_reversed_p, N, true, config, out_evals_reversed_p); // (3) INTT back from coset to coeffs ntt_config.are_inputs_on_device = true; @@ -547,7 +547,7 @@ namespace icicle { config.is_async = true; config.stream = m_stream; ICICLE_CHECK( - icicle::slice(get_context_storage_immutable(p), 0 /*offset*/, stride, domain_size, config, d_evals)); + icicle::slice(get_context_storage_immutable(p), 0 /*offset*/, stride, poly_size, domain_size, config, d_evals)); } else { ICICLE_CHECK(icicle_memset(d_evals, 0, domain_size * sizeof(I))); auto ntt_config = default_ntt_config(); diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index e0cf6f7af..42dfca8bd 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -17,16 +17,23 @@ namespace icicle { * @note APIs with a single input, ignore input b. 
*/ struct VecOpsConfig { - icicleStreamHandle stream; /**< Stream for asynchronous execution. */ - bool is_a_on_device; /**< True if `a` is on the device, false if it is not. Default value: false. */ - bool is_b_on_device; /**< True if `b` is on the device, false if it is not. Default value: false. OPTIONAL. */ - bool is_result_on_device; /**< If true, the output is preserved on the device, otherwise on the host. Default value: - false. */ - bool is_async; /**< Whether to run the vector operations asynchronously. - If set to `true`, the function will be non-blocking and synchronization - must be explicitly managed using `cudaStreamSynchronize` or `cudaDeviceSynchronize`. - If set to `false`, the function will block the current CPU thread. */ - ConfigExtension* ext = nullptr; /**< Backend-specific extension. */ + icicleStreamHandle stream; /** Stream for asynchronous execution. */ + bool is_a_on_device; /** True if `a` is on the device, false if it is not. Default value: false. */ + bool is_b_on_device; /** True if `b` is on the device, false if it is not. Default value: false. OPTIONAL. */ + bool is_result_on_device; /** If true, the output is preserved on the device, otherwise on the host. Default value: + false. */ + bool is_async; /** Whether to run the vector operations asynchronously. + If set to `true`, the function will be non-blocking and synchronization + must be explicitly managed using `cudaStreamSynchronize` or `cudaDeviceSynchronize`. + If set to `false`, the function will block the current CPU thread. */ + int batch_size; /** Number of vectors (or operations) to process in a batch. + Each vector operation will be performed independently on each batch element. + Default value: 1. */ + bool + columns_batch; /** True if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). + If false, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array). 
+ Default value: false. */ + ConfigExtension* ext = nullptr; /** Backend-specific extension. */ }; /** @@ -42,52 +49,29 @@ namespace icicle { false, // is_b_on_device false, // is_result_on_device false, // is_async + 1, // batch_size + false, // columns_batch }; return config; } - // Reduction operations - - /** - * @brief Computes the product of all elements in a vector. - * - * @tparam T Type of the elements in the vector. - * @param vec_a Input vector. - * @param n Number of elements in the vector. - * @param config Configuration for the operation. - * @param output Output scalar to store the result. - * @return eIcicleError Error code indicating success or failure. - */ - - template - eIcicleError vector_product(const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output, uint64_t offset, uint64_t stride); - - /** - * @brief Computes the sum of all elements in a vector. - * - * @tparam T Type of the elements in the vector. - * @param vec_a Input vector. - * @param n Number of elements in the vector. - * @param config Configuration for the operation. - * @param output Output scalar to store the result. - * @return eIcicleError Error code indicating success or failure. - */ - - template - eIcicleError vector_sum(const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output, uint64_t offset, uint64_t stride); - - // Element-wise vector operations /** * @brief Adds two vectors element-wise. * * @tparam T Type of the elements in the vectors. - * @param vec_a Input vector `a`. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). 
+ * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. */ template @@ -97,24 +81,35 @@ namespace icicle { * @brief Accumulates the elements of two vectors element-wise and stores the result in the first vector. * * @tparam T Type of the elements in the vectors. - * @param vec_a Input/output vector `a`. The result will be written back to this vector. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first Input/output vector(s). The result will be written back to this vector. + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). + * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. * @return eIcicleError Error code indicating success or failure. */ template - eIcicleError vector_accumulate(T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config); + eIcicleError vector_accumulate(T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config); // use vector_add (inplace) /** * @brief Subtracts vector `b` from vector `a` element-wise. * * @tparam T Type of the elements in the vectors. - * @param vec_a Input vector `a`. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first input vector(s). 
+ * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). + * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. */ template @@ -124,11 +119,17 @@ namespace icicle { * @brief Multiplies two vectors element-wise. * * @tparam T Type of the elements in the vectors. - * @param vec_a Input vector `a`. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). + * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. */ template @@ -138,11 +139,17 @@ namespace icicle { * @brief Divides vector `a` by vector `b` element-wise. * * @tparam T Type of the elements in the vectors. 
- * @param vec_a Input vector `a`. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). + * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. */ template @@ -152,15 +159,60 @@ namespace icicle { * @brief Converts elements to and from Montgomery form. * * @tparam T Type of the elements. - * @param input Input vector. - * @param size Number of elements in the input vector. - * @param is_into True to convert into Montgomery form, false to convert out of Montgomery form. + * @param input Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in each vector. + * @param is_to_montgomery True to convert into Montgomery form, false to convert out of Montgomery form. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. 
* @return eIcicleError Error code indicating success or failure. */ template - eIcicleError convert_montgomery(const T* input, uint64_t size, bool is_into, const VecOpsConfig& config, T* output); + eIcicleError convert_montgomery(const T* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, T* output); + + // Reduction operations + + /** + * @brief Computes the sum of all elements in each vector in a batch. + * + * @tparam T Type of the elements in the vector. + * @param vec_a Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in each vector. + * @param config Configuration for the operation. + * @param output Pointer to the output array where the results will be stored. + * @return eIcicleError Error code indicating success or failure. + */ + + template + eIcicleError vector_sum(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); + + /** + * @brief Computes the product of all elements in each vector in the batch. + * + * @tparam T Type of the elements in the vectors. + * @param vec_a Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in each vector. + * @param config Configuration for the operation. + * @param output Pointer to the output array where the results will be stored. + * @return eIcicleError Error code indicating success or failure. 
+ */ + + template + eIcicleError vector_product(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); + + // Scalar-Vector operations @@ -168,43 +220,66 @@ namespace icicle { * @brief Adds a scalar to each element of a vector. * * @tparam T Type of the elements in the vector and the scalar. - * @param scalar_a Input scalar. - * @param vec_b Input vector. - * @param size Number of elements in the vector. + * @param scalar_a Pointer to the input scalar(s). + * - If `use_single_scalar` is `true`, this should point to a single scalar value. + * - If `use_single_scalar` is `false`, this should point to an array of scalars with length `config.batch_size`. + * @param vec_b Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in a vector. + * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of scalars (`false`). * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. + * @note To subtract a scalar from each element of a vector - use scalar_add_vec with negative scalar. */ template - eIcicleError scalar_add_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); + eIcicleError scalar_add_vec(const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); /** * @brief Subtracts each element of a vector from a scalar, elementwise (res[i]=scalar-vec[i]). * * @tparam T Type of the elements in the vector and the scalar. - * @param scalar_a Input scalar. 
- * @param vec_b Input vector. - * @param size Number of elements in the vector. + * @param scalar_a Pointer to Input scalar(s). + * - If `use_single_scalar` is `true`, this should point to a single scalar value. + * - If `use_single_scalar` is `false`, this should point to an array of scalars with length `config.batch_size`. + * @param vec_b Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in a vector. + * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of scalars (`false`). * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. + * @note To subtract a scalar from each element of a vector - use scalar_add_vec with negative scalar. */ template - eIcicleError scalar_sub_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); + eIcicleError scalar_sub_vec(const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); /** * @brief Multiplies each element of a vector by a scalar. * * @tparam T Type of the elements in the vector and the scalar. - * @param scalar_a Input scalar. - * @param vec_b Input vector. - * @param size Number of elements in the vector. + * @param scalar_a Pointer to Input scalar(s). + * - If `use_single_scalar` is `true`, this should point to a single scalar value. + * - If `use_single_scalar` is `false`, this should point to an array of scalars with length `config.batch_size`. + * @param vec_b Pointer to the input vector(s). 
+ * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in a vector. + * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of scalars (`false`). * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. */ template - eIcicleError scalar_mul_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); + eIcicleError scalar_mul_vec(const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); // Matrix operations @@ -212,56 +287,70 @@ namespace icicle { * @brief Transposes a matrix. * * @tparam T Type of the elements in the matrix. - * @param mat_in Input matrix. - * @param nof_rows Number of rows in the input matrix. - * @param nof_cols Number of columns in the input matrix. + * @param mat_in Pointer to the input matrix or matrices. + * @param nof_rows Number of rows in each input matrix. + * @param nof_cols Number of columns in each input matrix. * @param config Configuration for the operation. - * @param mat_out Output matrix to store the result. + * @param mat_out Pointer to the output matrix or matrices where the transposed matrices will be stored. * @return eIcicleError Error code indicating success or failure. + * @note The input matrices are assumed to be stored in row-major order. + * This function transposes an input matrix or a batch of matrices. 
*/ template eIcicleError matrix_transpose(const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out); - + + // Miscellaneous operations /** - * @brief Reorders the vector elements based on bit-reverse. That is out[i]=in[bitrev[i]]. + * @brief Reorders the vector (or batch of vectors) elements based on bit-reverse. That is out[i]=in[bitrev[i]]. * * @tparam T Type of the elements in the vector. - * @param vec_in Input vector. - * @param size Number of elements in the input vector. + * @param vec_in Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param vec_out Output vector to store the result. + * @param vec_out Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. + * @note If `vec_in` and `vec_out` point to the same memory location, the operation is performed in-place. */ template eIcicleError bit_reverse(const T* vec_in, uint64_t size, const VecOpsConfig& config, T* vec_out); /** - * @brief Extracts a slice from a vector. + * @brief Extracts a slice from a vector or batch of vectors. * * @tparam T Type of the elements in the vector. - * @param vec_in Input vector. - * @param offset Offset from which to start the slice. + * @param vec_in Pointer to the input vector(s). + * @param offset Offset from which to start the slice in each vector. * @param stride Stride between elements in the slice. - * @param size Number of elements in the slice. - * @param config Configuration for the operation. - * @param vec_out Output vector to store the result. 
+   * @param size_in Number of elements in one input vector.
+   * @param size_out Number of elements in one output vector.
+   * @param config Configuration for the operation.
+   * @param vec_out Pointer to the output vector(s) where the results will be stored.
+   *                The output array should have the same storage layout as the input vectors.
    * @return eIcicleError Error code indicating success or failure.
+   * @note The total input size is `size_in * config.batch_size`.
+   *       The total output size is `size_out * config.batch_size`.
    */
   template <typename T>
   eIcicleError
-  slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size, const VecOpsConfig& config, T* vec_out);
+  slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size_in, uint64_t size_out, const VecOpsConfig& config, T* vec_out);

   /**
-   * @brief Finds the highest non-zero index in a vector.
+   * @brief Finds the highest non-zero index in a vector or batch of vectors.
    *
    * @tparam T Type of the elements in the vector.
-   * @param vec_in Input vector.
-   * @param size Number of elements in the input vector.
+   * @param vec_in Pointer to the input vector(s).
+   * @param size Number of elements in each input vector.
    * @param config Configuration for the operation.
-   * @param out_idx Output index of the highest non-zero element.
+   * @param out_idx Pointer to an array where the output indices of the highest non-zero element in each input vector will be stored.
+   *                The array should have a length of at least `config.batch_size`.
    * @return eIcicleError Error code indicating success or failure.
    */
   template <typename T>
@@ -271,12 +360,20 @@ namespace icicle {
    * @brief Evaluates a polynomial at given domain points.
    *
    * @tparam T Type of the elements in the polynomial and domain.
-   * @param coeffs Pointer to the array of coefficients of the polynomial.
-   * @param coeffs_size Number of coefficients in the polynomial.
-   * @param domain Pointer to the array of points at which to evaluate the polynomial.
+ * @param coeffs Pointer to the array of coefficients of the polynomial(s). + * - The size of `coeffs` should be `coeffs_size * batch_size`. + * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored contiguously. + * - If `config.columns_batch` is `true`, coefficients are interleaved. + * @param coeffs_size Number of coefficients in each polynomial. + * @param domain Pointer to the array of points at which to evaluate the polynomial(s). + * - The same domain is used for all polynomials. + * - The size of `domain` should be `domain_size`. * @param domain_size Number of domain points. * @param config Configuration for the operation. * @param evals Pointer to the array where the evaluated results will be stored. This is an output parameter. + * - The size of `evals` should be `domain_size * batch_size`. + * - If `config.columns_batch` is `false`, results for each polynomial are stored contiguously. + * - If `config.columns_batch` is `true`, results are interleaved. * @return eIcicleError Error code indicating success or failure. */ template @@ -289,19 +386,30 @@ namespace icicle { T* evals /*OUT*/); /** - * @brief Divides two polynomials. + * @brief Divides two polynomials or batch of couples of polynomials. * * @tparam T Type of the elements in the polynomials. - * @param numerator Pointer to the array of coefficients of the numerator polynomial. + * @param numerator Pointer to the array of coefficients of the numerator polynomial(s). + * - The size of `numerator` should be `(numerator_deg + 1) * batch_size`. + * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored contiguously. + * - If `config.columns_batch` is `true`, coefficients are interleaved. * @param numerator_deg Degree of the numerator polynomial. - * @param denominator Pointer to the array of coefficients of the denominator polynomial. 
+ * @param denominator Pointer to the array of coefficients of the denominator polynomial(s). + * - Storage layout is similar to `numerator`. * @param denominator_deg Degree of the denominator polynomial. * @param config Configuration for the operation. - * @param q_out Pointer to the array where the quotient will be stored. This is an output parameter. - * @param q_size Size of the quotient array. - * @param r_out Pointer to the array where the remainder will be stored. This is an output parameter. + * @param q_out Pointer to the array where the quotient polynomial(s) will be stored. This is an output parameter. + * - The storage layout should match that of `numerator`. + * @param q_size Size of the quotient array for one polynomial. + * @param r_out Pointer to the array where the remainder polynomial(s) will be stored. This is an output parameter. + * - The storage layout should match that of `numerator`. + * - The size of `r_out` should be sufficient to hold the remainder coefficients for each polynomial. * @param r_size Size of the remainder array. * @return eIcicleError Error code indicating success or failure. + * + * @note The degrees should satisfy `numerator_deg >= denominator_deg`. + * The sizes `q_size` and `r_size` must be at least `numerator_deg - denominator_deg + 1` and `denominator_deg`, respectively. + * The function assumes that the input and output arrays are properly allocated. 
*/ template eIcicleError polynomial_division( diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index 29ab25ba0..db86e6e73 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -8,225 +8,225 @@ namespace icicle { ICICLE_DISPATCHER_INST(VectorProductDispatcher, vector_product, scalarVectorReduceOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_product)( - const scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, scalar_t* output, uint64_t offset, uint64_t stride) + const scalar_t* vec_a, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorProductDispatcher::execute(vec_a, n, *config, output, offset, stride); + return VectorProductDispatcher::execute(vec_a, size, *config, output); } template <> eIcicleError - vector_product(const scalar_t* vec_a, uint64_t n, const VecOpsConfig& config, scalar_t* output, uint64_t offset, uint64_t stride) + vector_product(const scalar_t* vec_a, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_product)(vec_a, n, &config, output, offset, stride); + return CONCAT_EXPAND(FIELD, vector_product)(vec_a, size, &config, output); } /*********************************** REDUCE SUM ****************************/ ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, scalarVectorReduceOpImpl ); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_sum)( - const scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, scalar_t* output, uint64_t offset=0, uint64_t stride=1) + const scalar_t* vec_a, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorSumDispatcher::execute(vec_a, n, *config, output, offset, stride); + return VectorSumDispatcher::execute(vec_a, size, *config, output); } template <> eIcicleError - vector_sum(const scalar_t* vec_a, uint64_t n, const VecOpsConfig& config, scalar_t* output, uint64_t offset, uint64_t stride) + vector_sum(const scalar_t* vec_a, uint64_t size, const VecOpsConfig& config, 
scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_sum)(vec_a, n, &config, output, offset, stride); + return CONCAT_EXPAND(FIELD, vector_sum)(vec_a, size, &config, output); } /*********************************** ADD ***********************************/ - ICICLE_DISPATCHER_INST(VectorAddDispatcher, vector_add, scalarVectorOpImpl); + ICICLE_DISPATCHER_INST(VectorAddDispatcher, vector_add, vectorVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_add)( - const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorAddDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorAddDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError - vector_add(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_add(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_add)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_add)(vec_a, vec_b, size, &config, output); } #ifdef EXT_FIELD ICICLE_DISPATCHER_INST(VectorAddExtFieldDispatcher, extension_vector_add, extFieldVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_add)( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig* config, extension_t* output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig* config, extension_t* output) { - return VectorAddExtFieldDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorAddExtFieldDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError vector_add( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig& config, extension_t* 
output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_vector_add)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, extension_vector_add)(vec_a, vec_b, size, &config, output); } #endif // EXT_FIELD /*********************************** ACCUMULATE ***********************************/ - ICICLE_DISPATCHER_INST(VectorAccumulateDispatcher, vector_accumulate, scalarVectorOpImplInplaceA); + ICICLE_DISPATCHER_INST(VectorAccumulateDispatcher, vector_accumulate, vectorVectorOpImplInplaceA); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_accumulate)( - scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config) + scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config) { - return VectorAccumulateDispatcher::execute(vec_a, vec_b, n, *config); + return VectorAccumulateDispatcher::execute(vec_a, vec_b, size, *config); } template <> - eIcicleError vector_accumulate(scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config) + eIcicleError vector_accumulate(scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config) { - return CONCAT_EXPAND(FIELD, vector_accumulate)(vec_a, vec_b, n, &config); + return CONCAT_EXPAND(FIELD, vector_accumulate)(vec_a, vec_b, size, &config); } #ifdef EXT_FIELD ICICLE_DISPATCHER_INST(VectorAccumulateExtFieldDispatcher, extension_vector_accumulate, extFieldVectorOpImplInplaceA); extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_accumulate)( - extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig* config) + extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig* config) { - return VectorAccumulateExtFieldDispatcher::execute(vec_a, vec_b, n, *config); + return VectorAccumulateExtFieldDispatcher::execute(vec_a, vec_b, size, *config); } template <> - eIcicleError 
vector_accumulate(extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig& config) + eIcicleError vector_accumulate(extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config) { - return CONCAT_EXPAND(FIELD, extension_vector_accumulate)(vec_a, vec_b, n, &config); + return CONCAT_EXPAND(FIELD, extension_vector_accumulate)(vec_a, vec_b, size, &config); } #endif // EXT_FIELD /*********************************** SUB ***********************************/ - ICICLE_DISPATCHER_INST(VectorSubDispatcher, vector_sub, scalarVectorOpImpl); + ICICLE_DISPATCHER_INST(VectorSubDispatcher, vector_sub, vectorVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_sub)( - const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorSubDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorSubDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError - vector_sub(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_sub(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_sub)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_sub)(vec_a, vec_b, size, &config, output); } #ifdef EXT_FIELD ICICLE_DISPATCHER_INST(VectorSubExtFieldDispatcher, extension_vector_sub, extFieldVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_sub)( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig* config, extension_t* output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig* config, extension_t* output) { - return VectorSubExtFieldDispatcher::execute(vec_a, vec_b, n, *config, 
output); + return VectorSubExtFieldDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError vector_sub( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig& config, extension_t* output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_vector_sub)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, extension_vector_sub)(vec_a, vec_b, size, &config, output); } #endif // EXT_FIELD /*********************************** MUL ***********************************/ - ICICLE_DISPATCHER_INST(VectorMulDispatcher, vector_mul, scalarVectorOpImpl); + ICICLE_DISPATCHER_INST(VectorMulDispatcher, vector_mul, vectorVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_mul)( - const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorMulDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorMulDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError - vector_mul(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_mul(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_mul)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_mul)(vec_a, vec_b, size, &config, output); } #ifdef EXT_FIELD ICICLE_DISPATCHER_INST(VectorMulExtFieldDispatcher, extension_vector_mul, extFieldVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_mul)( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig* config, extension_t* output) + const extension_t* vec_a, const extension_t* 
vec_b, uint64_t size, const VecOpsConfig* config, extension_t* output) { - return VectorMulExtFieldDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorMulExtFieldDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError vector_mul( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig& config, extension_t* output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_vector_mul)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, extension_vector_mul)(vec_a, vec_b, size, &config, output); } #endif // EXT_FIELD /*********************************** DIV ***********************************/ - ICICLE_DISPATCHER_INST(VectorDivDispatcher, vector_div, scalarVectorOpImpl); + ICICLE_DISPATCHER_INST(VectorDivDispatcher, vector_div, vectorVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_div)( - const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorDivDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorDivDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError - vector_div(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_div(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_div)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_div)(vec_a, vec_b, size, &config, output); } /*********************************** (Scalar + Vector) ELEMENT WISE ***********************************/ ICICLE_DISPATCHER_INST(ScalarAddDispatcher, scalar_add_vec, scalarVectorOpImpl); extern "C" 
eIcicleError CONCAT_EXPAND(FIELD, scalar_add_vec)( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig* config, scalar_t* output) { - return ScalarAddDispatcher::execute(scalar_a, vec_b, n, *config, output); + return ScalarAddDispatcher::execute(scalar_a, vec_b, size, use_single_scalar, *config, output); } template <> eIcicleError scalar_add_vec( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, scalar_add_vec)(scalar_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, scalar_add_vec)(scalar_a, vec_b, size, use_single_scalar, &config, output); } /*********************************** (Scalar - Vector) ELEMENT WISE ***********************************/ ICICLE_DISPATCHER_INST(ScalarSubDispatcher, scalar_sub_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_sub_vec)( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig* config, scalar_t* output) { - return ScalarSubDispatcher::execute(scalar_a, vec_b, n, *config, output); + return ScalarSubDispatcher::execute(scalar_a, vec_b, size, use_single_scalar, *config, output); } template <> eIcicleError scalar_sub_vec( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, scalar_sub_vec)(scalar_a, vec_b, n, &config, output); + 
return CONCAT_EXPAND(FIELD, scalar_sub_vec)(scalar_a, vec_b, size, use_single_scalar, &config, output); } /*********************************** MUL BY SCALAR ***********************************/ ICICLE_DISPATCHER_INST(ScalarMulDispatcher, scalar_mul_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_mul_vec)( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig* config, scalar_t* output) { - return ScalarMulDispatcher::execute(scalar_a, vec_b, n, *config, output); + return ScalarMulDispatcher::execute(scalar_a, vec_b, size, use_single_scalar, *config, output); } template <> eIcicleError scalar_mul_vec( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, scalar_mul_vec)(scalar_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, scalar_mul_vec)(scalar_a, vec_b, size, use_single_scalar, &config, output); } /*********************************** CONVERT MONTGOMERY ***********************************/ @@ -234,16 +234,16 @@ namespace icicle { ICICLE_DISPATCHER_INST(ScalarConvertMontgomeryDispatcher, scalar_convert_montgomery, scalarConvertMontgomeryImpl) extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_convert_montgomery)( - const scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, scalar_t* output) + const scalar_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig* config, scalar_t* output) { - return ScalarConvertMontgomeryDispatcher::execute(input, size, is_into, *config, output); + return ScalarConvertMontgomeryDispatcher::execute(input, size, is_to_montgomery, *config, output); } template <> eIcicleError - 
convert_montgomery(const scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig& config, scalar_t* output) + convert_montgomery(const scalar_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, scalar_convert_montgomery)(input, size, is_into, &config, output); + return CONCAT_EXPAND(FIELD, scalar_convert_montgomery)(input, size, is_to_montgomery, &config, output); } #ifdef EXT_FIELD @@ -251,16 +251,16 @@ namespace icicle { ExtFieldConvertMontgomeryDispatcher, extension_scalar_convert_montgomery, extFieldConvertMontgomeryImpl) extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_scalar_convert_montgomery)( - const extension_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, extension_t* output) + const extension_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig* config, extension_t* output) { - return ExtFieldConvertMontgomeryDispatcher::execute(input, size, is_into, *config, output); + return ExtFieldConvertMontgomeryDispatcher::execute(input, size, is_to_montgomery, *config, output); } template <> eIcicleError convert_montgomery( - const extension_t* input, uint64_t size, bool is_into, const VecOpsConfig& config, extension_t* output) + const extension_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_scalar_convert_montgomery)(input, size, is_into, &config, output); + return CONCAT_EXPAND(FIELD, extension_scalar_convert_montgomery)(input, size, is_to_montgomery, &config, output); } #endif // EXT_FIELD @@ -304,11 +304,12 @@ namespace icicle { const scalar_t* input, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig* config, scalar_t* output) { - return ScalarSliceDispatcher::execute(input, offset, stride, size, *config, output); + return ScalarSliceDispatcher::execute(input, offset, stride, size_in, 
size_out, *config, output);
  }

  template <>
@@ -316,11 +317,12 @@ namespace icicle {
    const scalar_t* input,
    uint64_t offset,
    uint64_t stride,
-    uint64_t size,
+    uint64_t size_in,
+    uint64_t size_out,
    const VecOpsConfig& config,
    scalar_t* output)
  {
-    return CONCAT_EXPAND(FIELD, slice)(input, offset, stride, size, &config, output);
+    return CONCAT_EXPAND(FIELD, slice)(input, offset, stride, size_in, size_out, &config, output);
  }

#ifdef EXT_FIELD
@@ -350,7 +352,7 @@ namespace icicle {
  }
#endif // EXT_FIELD

-  /*********************************** HIGHEST NON ZERO IDX ***********************************/
+  /*********************************** HIGHEST NON ZERO IDX ***********************************/

  ICICLE_DISPATCHER_INST(ScalarHighestNonZeroIdxDispatcher, highest_non_zero_idx, scalarHighNonZeroIdxOpImpl)

@@ -399,8 +401,8 @@ namespace icicle {
  ICICLE_DISPATCHER_INST(ScalarPolyDivDispatcher, poly_division, scalarPolyDivImpl)

  extern "C" eIcicleError CONCAT_EXPAND(FIELD, poly_division)(
-    const scalar_t* numerator,
-    int64_t numerator_deg,
+    const scalar_t* numerator,
+    int64_t numerator_deg,
    const scalar_t* denumerator,
    int64_t denumerator_deg,
    const VecOpsConfig* config,
@@ -410,13 +412,13 @@ namespace icicle {
    uint64_t r_size)
  {
    return ScalarPolyDivDispatcher::execute(
-      numerator, numerator_deg, denumerator, denumerator_deg, *config, q_out, q_size, r_out, r_size);
+      numerator, numerator_deg, denumerator, denumerator_deg, *config, q_out, q_size, r_out, r_size);
  }

  template <>
  eIcicleError polynomial_division(
-    const scalar_t* numerator,
-    int64_t numerator_deg,
+    const scalar_t* numerator,
+    int64_t numerator_deg,
    const scalar_t* denumerator,
    int64_t denumerator_deg,
    const VecOpsConfig& config,
@@ -426,7 +428,7 @@ namespace icicle {
    uint64_t r_size)
  {
    return CONCAT_EXPAND(FIELD, poly_division)(
-      numerator, numerator_deg, denumerator, denumerator_deg, &config, q_out, q_size, r_out, r_size);
+      numerator, numerator_deg, denumerator, denumerator_deg, &config, q_out, q_size, r_out, r_size);
  }

-} // namespace icicle
\ No newline at end of file
+} // namespace icicle
\ No newline at end of file

From 1488732c21430ff2036eef8f83b5fe9ebac1d304 Mon Sep 17 00:00:00 2001
From: Shanie Winitz
Date: Sat, 12 Oct 2024 19:27:19 +0300
Subject: [PATCH 08/43] vec_ops - added: config.batch, parallel transpose,
 tests

---
 icicle/backend/cpu/src/field/cpu_vec_ops.cpp |  227 ++--
 .../include/icicle/backend/vec_ops_backend.h |  191 ++-
 icicle/include/icicle/fields/host_math.h     |    2 +-
 .../default_backend/default_poly_backend.h   |   10 +-
 icicle/include/icicle/vec_ops.h              |   16 +-
 icicle/src/vec_ops.cpp                       |   28 +-
 icicle/tests/test_curve_api.cpp              |    3 +-
 icicle/tests/test_field_api.cpp              | 1000 ++++++++++++++---
 8 files changed, 1085 insertions(+), 392 deletions(-)

diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp
index 952f5108f..a56cdc73c 100644
--- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp
+++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp
@@ -20,16 +20,17 @@ enum VecOperation {
   VECTOR_SUB,
   VECTOR_MUL,
   VECTOR_DIV,
-  VECTOR_SUM,
   CONVERT_TO_MONTGOMERY,
   CONVERT_FROM_MONTGOMERY,
+  VECTOR_SUM,
   VECTOR_PRODUCT,
   SCALAR_ADD_VEC,
   SCALAR_SUB_VEC,
   SCALAR_MUL_VEC,
   BIT_REVERSE,
   SLICE,
-  REPLACE_ELEMENTS,
+  REPLACE_ELEMENTS,
+  OUT_OF_PLACE_MATRIX_TRANSPOSE,

   NOF_OPERATIONS
 };
@@ -80,8 +81,8 @@ class VectorOpTask : public TaskBase
     dispatch();
   }

-  // Set the operands for bitrev operation and dispatch the task
-  void send_bitrev_task(
+  // Set the operands for bit_reverse operation and dispatch the task
+  void send_bit_reverse_task(
     VecOperation operation, uint32_t bit_size, uint64_t start_index, const uint32_t nof_operations, const T* op_a, const uint64_t stride, T* output)
   {
     m_operation = operation;
@@ -121,9 +122,22 @@ class VectorOpTask : public TaskBase
     dispatch();
   }

+  void send_out_of_place_matrix_transpose_task(VecOperation operation, const T* mat_in, const uint32_t nof_operations, 
const uint32_t nof_rows, const uint32_t nof_cols, const uint32_t stride, T* mat_out) + { + m_operation = operation; + m_op_a = mat_in; + m_nof_operations = nof_operations; + m_nof_rows = nof_rows; + m_nof_cols = nof_cols; + m_stride = stride; + m_output = mat_out; + dispatch(); + } // Execute the selected function based on m_operation - virtual void execute() { (this->*functionPtrs[static_cast(m_operation)])(); } + virtual void execute() { + (this->*functionPtrs[static_cast(m_operation)])(); + } private: // Single worker functionality to execute vector add (+) @@ -172,17 +186,17 @@ class VectorOpTask : public TaskBase // Single worker functionality to execute sum(vector) void vector_sum() { - m_intermidiate_res[m_idx_in_batch] = T::zero(); + m_intermidiate_res = T::zero(); for (uint64_t i = 0; i < (m_stop_index * m_stride); i = i + m_stride) { - m_intermidiate_res[m_idx_in_batch] = m_intermidiate_res[m_idx_in_batch] + m_op_a[i]; + m_intermidiate_res = m_intermidiate_res + m_op_a[i]; } } // Single worker functionality to execute product(vector) void vector_product() { - m_intermidiate_res[m_idx_in_batch] = T::one(); + m_intermidiate_res = T::one(); for (uint64_t i = 0; i < (m_stop_index * m_stride); i = i + m_stride) { - m_intermidiate_res[m_idx_in_batch] = m_intermidiate_res[m_idx_in_batch] * m_op_a[i]; + m_intermidiate_res = m_intermidiate_res * m_op_a[i]; } } // Single worker functionality to execute scalar + vector @@ -247,8 +261,6 @@ class VectorOpTask : public TaskBase while (shifted_idx >= mod) { shifted_idx = (shifted_idx & mod) + (shifted_idx >> total_bits); } - // If shifted_idx == mod, result should be 0 since mod % mod == 0 - if (shifted_idx == mod) shifted_idx = 0; //TODO SHANIE - check if redundant return shifted_idx; } @@ -260,15 +272,29 @@ class VectorOpTask : public TaskBase for (uint32_t i = 0; i < m_nof_operations; ++i) { uint64_t start_idx = (*m_start_indices_in_mat)[m_start_index + i]; uint64_t idx = start_idx; + T prev = m_op_a[m_stride * idx]; 
do { uint64_t shifted_idx = idx << m_log_nof_rows; uint64_t new_idx = mersenne_mod(shifted_idx, total_bits); - m_output[m_stride * new_idx] = m_op_a[m_stride * idx]; + T next = m_op_a[m_stride * new_idx]; + m_output[m_stride * new_idx] = prev; + prev = next; idx = new_idx; } while (idx != start_idx); } } + // Single worker functionality for out of palce matrix transpose + void out_of_place_transpose() + { + for (uint32_t k = 0; k < m_nof_operations; ++k) { + for (uint32_t j = 0; j < m_nof_cols; ++j) { + m_output[m_stride * (j * m_nof_rows + k)] = m_op_a[m_stride * (k * m_nof_cols + j)]; + } + } + } + + // An array of available function pointers arranged according to the VecOperation enum using FunctionPtr = void (VectorOpTask::*)(); @@ -286,25 +312,30 @@ class VectorOpTask : public TaskBase &VectorOpTask::scalar_mul_vec, // SCALAR_MUL_VEC, &VectorOpTask::bit_reverse, // BIT_REVERSE &VectorOpTask::slice, // SLICE - &VectorOpTask::replace_elements // REPLACE_ELEMENTS + &VectorOpTask::replace_elements, // REPLACE_ELEMENTS + &VectorOpTask::out_of_place_transpose // OUT_OF_PLACE_MATRIX_TRANSPOSE + + }; VecOperation m_operation; // the operation to execute uint32_t m_nof_operations; // number of operations to execute for this task const T* m_op_a; // pointer to operand A. Operand A is a vector, or metrix in case of replace_elements const T* m_op_b; // pointer to operand B. Operand B is a vector or scalar - uint64_t m_start_index; // index used in bitreverse operation - uint64_t m_stop_index; // index used in reduce operations + uint64_t m_start_index; // index used in bitreverse operation and out of place matrix transpose + uint64_t m_stop_index; // index used in reduce operations and out of place matrix transpose uint32_t m_bit_size; // use in bitrev operation uint64_t m_stride; // used to support column batch operations uint64_t m_stride_out; // used in slice operation T* m_output; // pointer to the output. 
Can be a vector, scalar pointer, or a matrix pointer in case of replace_elements uint32_t m_log_nof_rows; // log of the number of rows in the matrix, used in replace_elements uint32_t m_log_nof_cols; // log of the number of columns in the matrix, used in replace_elements + uint32_t m_nof_rows; // the number of rows in the matrix, used in out of place matrix transpose + uint32_t m_nof_cols; // the number of columns in the matrix, used in out of place matrix transpose const std::vector* m_start_indices_in_mat; // Indices used in replace_elements operations public: - T* m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer + T m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer uint64_t m_idx_in_batch; // index in the batch. Used in intermidiate res tasks }; // class VectorOpTask @@ -325,7 +356,7 @@ template eIcicleError cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); + TasksManager> task_manager(get_nof_workers(config) - 1); const uint64_t total_nof_operations = size*config.batch_size; for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); @@ -340,7 +371,7 @@ template eIcicleError cpu_scalar_vector_op( VecOperation op, const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); + TasksManager> task_manager(get_nof_workers(config) - 1); const uint64_t total_nof_operations = use_single_scalar? size*config.batch_size : size; const uint32_t stride = (!use_single_scalar && config.columns_batch)? config.batch_size : 1; for (uint32_t idx_in_batch = 0; idx_in_batch < (use_single_scalar? 
1 : config.batch_size); idx_in_batch++) { @@ -416,15 +447,17 @@ template eIcicleError cpu_convert_montgomery( const Device& device, const T* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); + TasksManager> task_manager(get_nof_workers(config) - 1); const uint64_t total_nof_operations = size*config.batch_size; for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_1op_task( - is_to_montgomery ? CONVERT_TO_MONTGOMERY : CONVERT_FROM_MONTGOMERY, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), + (is_to_montgomery ? CONVERT_TO_MONTGOMERY : CONVERT_FROM_MONTGOMERY), std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), input + i, output + i); } task_manager.wait_done(); + for (uint64_t i = 0; i < size*config.batch_size; i++) { + } return eIcicleError::SUCCESS; } @@ -443,7 +476,7 @@ REGISTER_CONVERT_MONTGOMERY_EXT_FIELD_BACKEND("CPU", cpu_convert_montgomery eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); + TasksManager> task_manager(get_nof_workers(config) - 1); std::vector output_initialized = std::vector(config.batch_size, false); uint64_t vec_a_offset = 0; uint64_t idx_in_batch = 0; @@ -454,7 +487,7 @@ eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t size, return eIcicleError::SUCCESS; } if (task_p->is_completed()) { - output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] ? output[task_p->m_idx_in_batch] + task_p->m_intermidiate_res[task_p->m_idx_in_batch] : task_p->m_intermidiate_res[task_p->m_idx_in_batch]; + output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] ? 
output[task_p->m_idx_in_batch] + task_p->m_intermidiate_res : task_p->m_intermidiate_res; output_initialized[task_p->m_idx_in_batch] = true; } if (vec_a_offset < size) { @@ -482,7 +515,7 @@ REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); template eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); + TasksManager> task_manager(get_nof_workers(config) - 1); std::vector output_initialized = std::vector(config.batch_size, false); uint64_t vec_a_offset = 0; uint64_t idx_in_batch = 0; @@ -493,7 +526,7 @@ eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t s return eIcicleError::SUCCESS; } if (task_p->is_completed()) { - output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] ? output[task_p->m_idx_in_batch] + task_p->m_intermidiate_res[task_p->m_idx_in_batch] : task_p->m_intermidiate_res[task_p->m_idx_in_batch]; + output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] ? 
output[task_p->m_idx_in_batch] * task_p->m_intermidiate_res : task_p->m_intermidiate_res; output_initialized[task_p->m_idx_in_batch] = true; } if (vec_a_offset < size) { @@ -548,41 +581,32 @@ eIcicleError cpu_scalar_mul( REGISTER_SCALAR_MUL_VEC_BACKEND("CPU", cpu_scalar_mul); /*********************************** TRANSPOSE ***********************************/ -// template todo shanie - remove -// eIcicleError cpu_matrix_transpose_basic( -// const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) -// { -// ICICLE_ASSERT(mat_in && mat_out && nof_rows != 0 && nof_cols != 0) << "Invalid argument"; - -// // Perform the matrix transpose -// for (uint32_t i = 0; i < nof_rows; ++i) { -// for (uint32_t j = 0; j < nof_cols; ++j) { -// mat_out[j * nof_rows + i] = mat_in[i * nof_cols + j]; -// } -// } - -// return eIcicleError::SUCCESS; -// } template -eIcicleError cpu_matrix_transpose_batch( +eIcicleError out_of_place_matrix_transpose( const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) { - const T* cur_mat_in = mat_in; - T* cur_mat_out = mat_out; + TasksManager> task_manager(get_nof_workers(config) - 1); uint32_t stride = config.columns_batch? config.batch_size : 1; - const uint64_t total_elements = static_cast(nof_rows) * nof_cols; + const uint64_t total_elements_one_mat = static_cast(nof_rows) * nof_cols; + const uint32_t NOF_ROWS_PER_TASK = std::min((uint64_t)nof_rows, std::max((uint64_t)(NOF_OPERATIONS_PER_TASK / nof_cols) , (uint64_t)1)); for (uint32_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + const T* cur_mat_in = config.columns_batch? mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements_one_mat; + T* cur_mat_out = config.columns_batch? 
mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements_one_mat; // Perform the matrix transpose - for (uint32_t i = 0; i < nof_rows; ++i) { - for (uint32_t j = 0; j < nof_cols; ++j) { - cur_mat_out[stride*(j * nof_rows + i)] = cur_mat_in[stride*(i * nof_cols + j)]; - } + for (uint32_t i = 0; i < nof_rows; i += NOF_ROWS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_out_of_place_matrix_transpose_task( + OUT_OF_PLACE_MATRIX_TRANSPOSE, + cur_mat_in + stride*i*nof_cols, + std::min((uint64_t)NOF_ROWS_PER_TASK, (uint64_t)nof_rows - i), + nof_rows, + nof_cols, + stride, + cur_mat_out + (stride * i)); } - cur_mat_in += (config.columns_batch ? 1 : total_elements); - cur_mat_out += (config.columns_batch ? 1 : total_elements); } - + task_manager.wait_done(); return eIcicleError::SUCCESS; } @@ -595,20 +619,6 @@ uint32_t gcd(uint32_t a, uint32_t b) { return a; } -// template //TODO shanie - remove -// void replace_elements(uint32_t start_idx, uint32_t log_nof_rows, uint32_t log_nof_cols, const T* mat_in, T* mat_out) { -// uint64_t idx = start_idx; - -// while (true) { -// uint64_t new_idx = mersenne_mod(idx << log_nof_rows, log_nof_rows+log_nof_cols); // new_idx = (idx< void gen_necklace(uint32_t t, uint32_t p, uint32_t k, uint32_t length, std::vector& necklace, std::vector& task_indices) { @@ -620,9 +630,6 @@ void gen_necklace(uint32_t t, uint32_t p, uint32_t k, uint32_t length, std::vect start_idx += necklace[i] * multiplier; multiplier *= k; } - // for (int i = 1; i <= length; ++i) { // Compute start_idx as the decimal representation of the necklace //TODO SHANIE - remove - // start_idx = start_idx + necklace[i] * std::pow(k, length - i); - // } task_indices.push_back(start_idx); } return; @@ -638,53 +645,63 @@ void gen_necklace(uint32_t t, uint32_t p, uint32_t k, uint32_t length, std::vect } template -eIcicleError cpu_matrix_transpose_parallel( - const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t 
nof_cols, const VecOpsConfig& config, T* mat_out) -{ - ICICLE_ASSERT(mat_in && mat_out && nof_rows != 0 && nof_cols != 0) << "Invalid argument"; - - // check if the number of rows and columns are powers of 2, if not use the basic transpose - if ((nof_rows & (nof_rows - 1)) != 0 || (nof_cols & (nof_cols - 1)) != 0) { - cpu_matrix_transpose_batch(device, mat_in, nof_rows, nof_cols, config, mat_out); - return eIcicleError::SUCCESS; - } - +eIcicleError matrix_transpose_necklaces(const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out){ uint32_t log_nof_rows = static_cast(std::floor(std::log2(nof_rows))); uint32_t log_nof_cols = static_cast(std::floor(std::log2(nof_cols))); uint32_t gcd_value = gcd(log_nof_rows, log_nof_cols); uint32_t k = 1 << gcd_value; // Base of necklaces uint32_t length = (log_nof_cols + log_nof_rows) / gcd_value; // length of necklaces. Since all are powers of 2, equvalent to (log_nof_cols + log_nof_rows) / gcd_value; const uint64_t max_nof_operations = NOF_OPERATIONS_PER_TASK / length; - const uint64_t total_elements = static_cast(nof_rows) * nof_cols; + const uint64_t total_elements_one_mat = static_cast(nof_rows) * nof_cols; std::vector necklace(length + 1, 0); std::vector start_indices_in_mat; // Collect start indices gen_necklace(1, 1, k, length, necklace, start_indices_in_mat); - TasksManager> task_manager(get_nof_workers(config)); + TasksManager> task_manager(get_nof_workers(config) - 1); for (uint64_t i = 0; i < start_indices_in_mat.size(); i += max_nof_operations) { uint64_t nof_operations = std::min((uint64_t)max_nof_operations, start_indices_in_mat.size() - i); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_replace_elements_task( REPLACE_ELEMENTS, - config.columns_batch? mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements, + config.columns_batch? 
mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements_one_mat, nof_operations, start_indices_in_mat, i, log_nof_rows, log_nof_cols, config.columns_batch? config.batch_size : 1, - config.columns_batch? mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements); + config.columns_batch? mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements_one_mat); } } task_manager.wait_done(); return eIcicleError::SUCCESS; } -REGISTER_MATRIX_TRANSPOSE_BACKEND("CPU", cpu_matrix_transpose_parallel); + +template +eIcicleError cpu_matrix_transpose( + const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) +{ + ICICLE_ASSERT(mat_in && mat_out && nof_rows != 0 && nof_cols != 0) << "Invalid argument"; + + // check if the number of rows and columns are powers of 2, if not use the basic transpose + bool is_power_of_2 = (nof_rows & (nof_rows - 1)) == 0 && (nof_cols & (nof_cols - 1)) == 0; + bool is_inplace = mat_in == mat_out; + if (!is_inplace) { + return(out_of_place_matrix_transpose(device, mat_in, nof_rows, nof_cols, config, mat_out)); + } else if (is_power_of_2) { + return (matrix_transpose_necklaces(mat_in, nof_rows, nof_cols, config, mat_out)); + } else { + ICICLE_LOG_ERROR << "Matrix transpose is not supported for inplace non power of 2 rows and columns"; + return eIcicleError::INVALID_ARGUMENT; + } +} + +REGISTER_MATRIX_TRANSPOSE_BACKEND("CPU", cpu_matrix_transpose); #ifdef EXT_FIELD -REGISTER_MATRIX_TRANSPOSE_EXT_FIELD_BACKEND("CPU", cpu_matrix_transpose_parallel); +REGISTER_MATRIX_TRANSPOSE_EXT_FIELD_BACKEND("CPU", cpu_matrix_transpose); #endif // EXT_FIELD /*********************************** BIT REVERSE ***********************************/ @@ -698,12 +715,12 @@ cpu_bit_reverse(const Device& device, const T* vec_in, uint64_t size, const VecO ICICLE_ASSERT((1ULL << logn) == size) << "Invalid argument - size is not a power of 2"; // Perform the bit reverse - TasksManager> 
task_manager(get_nof_workers(config)); + TasksManager> task_manager(get_nof_workers(config) - 1); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { for (uint64_t i = 0; i < size; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_bitrev_task( + task_p->send_bit_reverse_task( BIT_REVERSE, logn, i, @@ -739,7 +756,7 @@ eIcicleError cpu_slice( ICICLE_ASSERT(vec_in != nullptr && vec_out != nullptr) << "Error: Invalid argument - input or output vector is null"; ICICLE_ASSERT(offset + (size_out-1) * stride < size_in) << "Error: Invalid argument - slice out of bound"; - TasksManager> task_manager(get_nof_workers(config)); + TasksManager> task_manager(get_nof_workers(config) - 1); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { for (uint64_t i = 0; i < size_out; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); @@ -817,19 +834,19 @@ REGISTER_POLYNOMIAL_EVAL("CPU", cpu_poly_eval); /*============================== polynomial division ==============================*/ template -void school_book_division_step_cpu(T* r, T* q, const T* b, int deg_r, int deg_b, const T& lc_b_inv) +void school_book_division_step_cpu(T* r, T* q, const T* b, int deg_r, int deg_b, const T& lc_b_inv, uint32_t stride = 1) { int64_t monomial = deg_r - deg_b; // monomial=1 is 'x', monomial=2 is x^2 etc. 
- T lc_r = r[deg_r]; + T lc_r = r[deg_r * stride]; // leading coefficient of r T monomial_coeff = lc_r * lc_b_inv; // lc_r / lc_b // adding monomial s to q (q=q+s) - q[monomial] = monomial_coeff; + q[monomial * stride] = monomial_coeff; for (int i = monomial; i <= deg_r; ++i) { - T b_coeff = b[i - monomial]; - r[i] = r[i] - monomial_coeff * b_coeff; + T b_coeff = b[(i - monomial) * stride]; + r[i * stride] = r[i * stride] - monomial_coeff * b_coeff; } } @@ -840,33 +857,37 @@ eIcicleError cpu_poly_divide( int64_t numerator_deg, const T* denumerator, int64_t denumerator_deg, + uint64_t q_size, + uint64_t r_size, const VecOpsConfig& config, T* q_out /*OUT*/, - uint64_t q_size, - T* r_out /*OUT*/, - uint64_t r_size) + T* r_out /*OUT*/) { ICICLE_ASSERT(r_size >= numerator_deg) << "polynomial division expects r(x) size to be similar to numerator size and higher than numerator degree(x)"; ICICLE_ASSERT(q_size >= (numerator_deg - denumerator_deg + 1)) << "polynomial division expects q(x) size to be at least deg(numerator)-deg(denumerator)+1"; - ICICLE_CHECK(icicle_copy_async(r_out, numerator, r_size * config.batch_size * sizeof(T), config.stream)); + // ICICLE_CHECK(icicle_copy_async(r_out, numerator, r_size * config.batch_size * sizeof(T), config.stream)); + // copy numerator to r_out // FIXME should it be copied using icicle_copy_async? + for (uint64_t i = 0; i < (numerator_deg+1)*config.batch_size; ++i) { + r_out[i] = numerator[i]; + } + uint32_t stride = config.columns_batch? config.batch_size : 1; + auto deg_r = std::make_unique(config.batch_size); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) { const T* curr_denumerator = config.columns_batch? denumerator + idx_in_batch : denumerator + idx_in_batch * (denumerator_deg+1); // Pointer to the current vector T* curr_q_out = config.columns_batch? q_out + idx_in_batch : q_out + idx_in_batch * q_size; // Pointer to the current vector T* curr_r_out = config.columns_batch? 
r_out + idx_in_batch : r_out + idx_in_batch * r_size; // Pointer to the current vector // invert largest coeff of b - const T& lc_b_inv = T::inverse(curr_denumerator[denumerator_deg]); - int64_t deg_r = numerator_deg; - while (deg_r >= denumerator_deg) { + const T& lc_b_inv = T::inverse(curr_denumerator[denumerator_deg * stride]); + deg_r[idx_in_batch] = numerator_deg; + while (deg_r[idx_in_batch] >= denumerator_deg) { // each iteration is removing the largest monomial in r until deg(r); + using vectorVectorOpImplInplaceA = std::function; + + using scalarConvertMontgomeryImpl = std::function; + using VectorReduceOpImpl = std::function; using scalarVectorOpImpl = std::function; + using scalarMatrixOpImpl = std::function; - using vectorVectorOpImpl = std::function; - using vectorVectorOpImplInplaceA = std::function; + using scalarSliceOpImpl = std::function; - void register_vector_sum(const std::string& deviceType, scalarVectorReduceOpImpl impl); + using scalarHighNonZeroIdxOpImpl = std::function; -#define REGISTER_VECTOR_SUM_BACKEND(DEVICE_TYPE, FUNC) \ - namespace { \ - static bool UNIQUE(_reg_vec_sum) = []() -> bool { \ - register_vector_sum(DEVICE_TYPE, FUNC); \ - return true; \ - }(); \ - } + using scalarPolyEvalImpl = std::function; - void register_vector_product(const std::string& deviceType, scalarVectorReduceOpImpl impl); + using scalarPolyDivImpl = std::function; -#define REGISTER_VECTOR_PRODUCT_BACKEND(DEVICE_TYPE, FUNC) \ - namespace { \ - static bool UNIQUE(_reg_vec_product) = []() -> bool { \ - register_vector_product(DEVICE_TYPE, FUNC); \ - return true; \ - }(); \ - } @@ -108,6 +151,36 @@ namespace icicle { }(); \ } + void register_scalar_convert_montgomery(const std::string& deviceType, scalarConvertMontgomeryImpl); + +#define REGISTER_CONVERT_MONTGOMERY_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_scalar_convert_mont) = []() -> bool { \ + register_scalar_convert_montgomery(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } 
+ + void register_vector_sum(const std::string& deviceType, VectorReduceOpImpl impl); + +#define REGISTER_VECTOR_SUM_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_sum) = []() -> bool { \ + register_vector_sum(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_vector_product(const std::string& deviceType, VectorReduceOpImpl impl); + +#define REGISTER_VECTOR_PRODUCT_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_product) = []() -> bool { \ + register_vector_product(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + void register_scalar_mul_vec(const std::string& deviceType, scalarVectorOpImpl impl); #define REGISTER_SCALAR_MUL_VEC_BACKEND(DEVICE_TYPE, FUNC) \ @@ -138,32 +211,6 @@ namespace icicle { }(); \ } - using scalarConvertMontgomeryImpl = std::function; - - void register_scalar_convert_montgomery(const std::string& deviceType, scalarConvertMontgomeryImpl); - -#define REGISTER_CONVERT_MONTGOMERY_BACKEND(DEVICE_TYPE, FUNC) \ - namespace { \ - static bool UNIQUE(_reg_scalar_convert_mont) = []() -> bool { \ - register_scalar_convert_montgomery(DEVICE_TYPE, FUNC); \ - return true; \ - }(); \ - } - - using scalarMatrixOpImpl = std::function; - void register_matrix_transpose(const std::string& deviceType, scalarMatrixOpImpl impl); #define REGISTER_MATRIX_TRANSPOSE_BACKEND(DEVICE_TYPE, FUNC) \ @@ -174,9 +221,6 @@ namespace icicle { }(); \ } - using scalarBitReverseOpImpl = std::function; - void register_scalar_bit_reverse(const std::string& deviceType, scalarBitReverseOpImpl); #define REGISTER_BIT_REVERSE_BACKEND(DEVICE_TYPE, FUNC) \ @@ -187,16 +231,6 @@ namespace icicle { }(); \ } - using scalarSliceOpImpl = std::function; - void register_slice(const std::string& deviceType, scalarSliceOpImpl); #define REGISTER_SLICE_BACKEND(DEVICE_TYPE, FUNC) \ @@ -207,9 +241,6 @@ namespace icicle { }(); \ } - using scalarHighNonZeroIdxOpImpl = std::function; - void register_highest_non_zero_idx(const 
std::string& deviceType, scalarHighNonZeroIdxOpImpl); #define REGISTER_HIGHEST_NON_ZERO_IDX_BACKEND(DEVICE_TYPE, FUNC) \ @@ -220,24 +251,6 @@ namespace icicle { }(); \ } - template - eIcicleError polynomial_eval( - const T* coeffs, - uint64_t coeffs_size, - const T* domain, - uint64_t domain_size, - const VecOpsConfig& config, - T* evals /*OUT*/); - - using scalarPolyEvalImpl = std::function; - void register_poly_eval(const std::string& deviceType, scalarPolyEvalImpl); #define REGISTER_POLYNOMIAL_EVAL(DEVICE_TYPE, FUNC) \ @@ -248,18 +261,6 @@ namespace icicle { }(); \ } - using scalarPolyDivImpl = std::function; - void register_poly_division(const std::string& deviceType, scalarPolyDivImpl); #define REGISTER_POLYNOMIAL_DIVISION(DEVICE_TYPE, FUNC) \ diff --git a/icicle/include/icicle/fields/host_math.h b/icicle/include/icicle/fields/host_math.h index e256aa922..9ced242d3 100644 --- a/icicle/include/icicle/fields/host_math.h +++ b/icicle/include/icicle/fields/host_math.h @@ -288,7 +288,7 @@ namespace host_math { r = left_shift(r); r.limbs[0] |= ((num.limbs[limb_idx] >> bit_idx) & 1); uint32_t c = add_sub_limbs(r, denom, temp); - if (limb_idx < NLIMBS_Q & !c) { + if ((limb_idx < NLIMBS_Q) & !c) { r = temp; q.limbs[limb_idx] |= 1 << bit_idx; } diff --git a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h index 7c0cca845..0ee0e2d0f 100644 --- a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h +++ b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h @@ -278,7 +278,15 @@ namespace icicle { config.is_result_on_device = true; ICICLE_CHECK(icicle::polynomial_division( - a_coeffs, deg_a, b_coeffs, deg_b, config, Q_coeffs, deg_a - deg_b + 1, R_coeffs, a_N)); + a_coeffs, + deg_a, + b_coeffs, + deg_b, + deg_a - deg_b + 1, + a_N, + config, + Q_coeffs, + R_coeffs)); } void quotient(PolyContext Q, PolyContext op_a, PolyContext op_b) 
override diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index 42dfca8bd..b89327eb4 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -295,6 +295,7 @@ namespace icicle { * @return eIcicleError Error code indicating success or failure. * @note The input matrices are assumed to be stored in row-major order. * This function transposes an input matrix or a batch of matrices. + * Matrix transpose inplace is not supported for non-power of 2 rows and columns. */ template eIcicleError @@ -312,7 +313,7 @@ namespace icicle { * - The layout depends on `config.columns_batch`: * - If `false`, vectors are stored contiguously. * - If `true`, vectors are stored as columns in a 2D array. - * @param size Number of elements in each vector. + * @param size Number of elements in each vector. Must be a power of 2. * @param config Configuration for the operation. * @param vec_out Pointer to the output vector(s) where the results will be stored. * The output array should have the same storage layout as the input vectors. @@ -337,6 +338,7 @@ namespace icicle { * @return eIcicleError Error code indicating success or failure. * @note The total input size is `size_in * config.batch_size`. * The total output size is `size_out * config.batch_size`. + * parameters must satisfy: offset + (size_out-1) * stride < size_in */ template eIcicleError @@ -350,7 +352,7 @@ namespace icicle { * @param size Number of elements in each input vector. * @param config Configuration for the operation. * @param out_idx Pointer to an array where the output indices of the highest non-zero element in each input vector will be stored. - * The array should have a length of at least `config.batch_size`. + * The array should have a length of `config.batch_size`. * @return eIcicleError Error code indicating success or failure. */ template @@ -398,13 +400,13 @@ namespace icicle { * - Storage layout is similar to `numerator`. 
* @param denominator_deg Degree of the denominator polynomial. * @param config Configuration for the operation. + * @param q_size Size of the quotient array for one polynomial. + * @param r_size Size of the remainder array. * @param q_out Pointer to the array where the quotient polynomial(s) will be stored. This is an output parameter. * - The storage layout should match that of `numerator`. - * @param q_size Size of the quotient array for one polynomial. * @param r_out Pointer to the array where the remainder polynomial(s) will be stored. This is an output parameter. * - The storage layout should match that of `numerator`. * - The size of `r_out` should be sufficient to hold the remainder coefficients for each polynomial. - * @param r_size Size of the remainder array. * @return eIcicleError Error code indicating success or failure. * * @note The degrees should satisfy `numerator_deg >= denominator_deg`. @@ -417,10 +419,10 @@ namespace icicle { int64_t numerator_deg, const T* denumerator, int64_t denumerator_deg, + uint64_t q_size, + uint64_t r_size, const VecOpsConfig& config, T* q_out /*OUT*/, - uint64_t q_size, - T* r_out /*OUT*/, - uint64_t r_size); + T* r_out /*OUT*/); } // namespace icicle \ No newline at end of file diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index db86e6e73..2c16ed389 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -5,7 +5,7 @@ namespace icicle { /*********************************** REDUCE PRODUCT ************************/ - ICICLE_DISPATCHER_INST(VectorProductDispatcher, vector_product, scalarVectorReduceOpImpl); + ICICLE_DISPATCHER_INST(VectorProductDispatcher, vector_product, VectorReduceOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_product)( const scalar_t* vec_a, uint64_t size, const VecOpsConfig* config, scalar_t* output) @@ -21,7 +21,7 @@ namespace icicle { } /*********************************** REDUCE SUM ****************************/ - ICICLE_DISPATCHER_INST(VectorSumDispatcher, 
vector_sum, scalarVectorReduceOpImpl ); + ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, VectorReduceOpImpl ); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_sum)( const scalar_t* vec_a, uint64_t size, const VecOpsConfig* config, scalar_t* output) @@ -401,34 +401,34 @@ namespace icicle { ICICLE_DISPATCHER_INST(ScalarPolyDivDispatcher, poly_division, scalarPolyDivImpl) extern "C" eIcicleError CONCAT_EXPAND(FIELD, poly_division)( - const scalar_t* sizeumerator, - int64_t sizeumerator_deg, + const scalar_t* numerator, + int64_t numerator_deg, const scalar_t* denumerator, int64_t denumerator_deg, + uint64_t q_size, + uint64_t r_size, const VecOpsConfig* config, scalar_t* q_out /*OUT*/, - uint64_t q_size, - scalar_t* r_out /*OUT*/, - uint64_t r_size) + scalar_t* r_out /*OUT*/) { return ScalarPolyDivDispatcher::execute( - sizeumerator, sizeumerator_deg, denumerator, denumerator_deg, *config, q_out, q_size, r_out, r_size); + numerator, numerator_deg, denumerator, denumerator_deg, q_size, r_size, *config, q_out, r_out); } template <> eIcicleError polynomial_division( - const scalar_t* sizeumerator, - int64_t sizeumerator_deg, + const scalar_t* numerator, + int64_t numerator_deg, const scalar_t* denumerator, int64_t denumerator_deg, + uint64_t q_size, + uint64_t r_size, const VecOpsConfig& config, scalar_t* q_out /*OUT*/, - uint64_t q_size, - scalar_t* r_out /*OUT*/, - uint64_t r_size) + scalar_t* r_out /*OUT*/) { return CONCAT_EXPAND(FIELD, poly_division)( - sizeumerator, sizeumerator_deg, denumerator, denumerator_deg, &config, q_out, q_size, r_out, r_size); + numerator, numerator_deg, denumerator, denumerator_deg, q_size, r_size, &config, q_out, r_out); } } // sizeamespace icicle \ No newline at end of file diff --git a/icicle/tests/test_curve_api.cpp b/icicle/tests/test_curve_api.cpp index 0769df7f9..9fe8bbe67 100644 --- a/icicle/tests/test_curve_api.cpp +++ b/icicle/tests/test_curve_api.cpp @@ -190,8 +190,7 @@ TEST_F(CurveApiTest, ecntt) run(s_main_target, 
out_main.get(), "ecntt", VERBOSE /*=measure*/, 1 /*=iters*/); run(s_ref_target, out_ref.get(), "ecntt", VERBOSE /*=measure*/, 1 /*=iters*/); - // ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), N * sizeof(projective_t))); // TODO ucomment when CPU is - // implemented + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), N * sizeof(projective_t))); } #endif // ECNTT diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 9743c6d2d..5aa9dd973 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -1,7 +1,11 @@ +#include #include #include #include "dlfcn.h" +#include #include +#include // For system + #include "icicle/runtime.h" #include "icicle/vec_ops.h" @@ -22,9 +26,11 @@ using FpMicroseconds = std::chrono::duration class FieldApiTest : public ::testing::Test @@ -38,9 +44,8 @@ class FieldApiTest : public ::testing::Test #endif icicle_load_backend_from_env_or_default(); - const bool is_cuda_registered = is_device_registered("CUDA"); - if (!is_cuda_registered) { ICICLE_LOG_ERROR << "CUDA device not found. Testing CPU vs CPU"; } - s_main_target = is_cuda_registered ? "CUDA" : "CPU"; + if (!s_is_cuda_registered) { ICICLE_LOG_ERROR << "CUDA device not found. Testing CPU vs reference (on cpu)"; } + s_main_target = s_is_cuda_registered ? 
"CUDA" : "CPU"; s_reference_target = "CPU"; } static void TearDownTestSuite() @@ -84,16 +89,20 @@ TYPED_TEST(FieldApiTest, FieldSanityTest) ASSERT_EQ(a * scalar_t::from(2), a + a); } -TYPED_TEST(FieldApiTest, vectorOps) -{ - const uint64_t N = 1 << 22; - auto in_a = std::make_unique(N); - auto in_b = std::make_unique(N); - FieldApiTest::random_samples(in_a.get(), N); - FieldApiTest::random_samples(in_b.get(), N); - auto out_main = std::make_unique(N); - auto out_ref = std::make_unique(N); +TYPED_TEST(FieldApiTest, vectorVectorOps) +{ + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const int total_size = N * batch_size; + auto in_a = std::make_unique(total_size); + auto in_b = std::make_unique(total_size); + auto out_main = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); auto vector_accumulate_wrapper = [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { @@ -105,6 +114,8 @@ TYPED_TEST(FieldApiTest, vectorOps) Device dev = {dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; std::ostringstream oss; oss << dev_type << " " << msg; @@ -116,45 +127,307 @@ TYPED_TEST(FieldApiTest, vectorOps) END_TIMER(VECADD_sync, oss.str().c_str(), measure); }; + // warmup // run(s_reference_target, out_ref.get(), false /*=measure*/, 16 /*=iters*/); // run(s_main_target, out_main.get(), false /*=measure*/, 1 /*=iters*/); + + // warmup + // run(s_reference_target, out_ref.get(), false /*=measure*/, 16 /*=iters*/); + // run(s_main_target, out_main.get(), false /*=measure*/, 1 /*=iters*/); + + // Element-wise vector operations + // If config.batch_size>1, (columns_batch=true or false) the operation is done element-wise anyway, so it doesn't affect the test 
- // accumulate - auto temp_result = std::make_unique(N); - auto initial_in_a = std::make_unique(N); - - std::memcpy(initial_in_a.get(), in_a.get(), N * sizeof(TypeParam)); - run(s_reference_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); - std::memcpy(temp_result.get(), in_a.get(), N * sizeof(TypeParam)); - std::memcpy(in_a.get(), initial_in_a.get(), N * sizeof(TypeParam)); - run(s_main_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); - ASSERT_EQ(0, memcmp(in_a.get(), temp_result.get(), N * sizeof(TypeParam))); - - // add - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); + // // add + FieldApiTest::random_samples(in_a.get(),total_size); + FieldApiTest::random_samples(in_b.get(),total_size); + if (!s_is_cuda_registered) { + for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] + in_b[i]; } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); + } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), N * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + + // // accumulate + FieldApiTest::random_samples(in_a.get(),total_size); + FieldApiTest::random_samples(in_b.get(),total_size); + if (!s_is_cuda_registered) { + for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] + in_b[i]; } + } else { + run(s_reference_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); + } + run(s_main_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); + ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), total_size * sizeof(TypeParam))); - // sub - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sub, "vector sub", ITERS); + // // sub + 
FieldApiTest::random_samples(in_a.get(),total_size); + FieldApiTest::random_samples(in_b.get(),total_size); + if (!s_is_cuda_registered) { + for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] - in_b[i]; } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sub, "vector sub", ITERS); + } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sub, "vector sub", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), N * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); - // mul - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_mul, "vector mul", ITERS); + // // mul + FieldApiTest::random_samples(in_a.get(),total_size); + FieldApiTest::random_samples(in_b.get(),total_size); + if (!s_is_cuda_registered) { + for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] * in_b[i]; } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_mul, "vector mul", ITERS); + } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_mul, "vector mul", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), N * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + + // // div + FieldApiTest::random_samples(in_a.get(),total_size); + FieldApiTest::random_samples(in_b.get(),total_size); + // reference + if (!s_is_cuda_registered) { + for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] * TypeParam::inverse(in_b[i]); } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_div, "vector div", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_div, "vector div", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + } -TYPED_TEST(FieldApiTest, matrixAPIsAsync) +TYPED_TEST(FieldApiTest, montgomeryConversion) { - const int R = 1 << 10, C = 1 << 8; - auto h_in = 
std::make_unique(R * C); - FieldApiTest::random_samples(h_in.get(), R * C); + int seed = time(0); + srand(seed); + // ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const bool is_to_montgomery = rand() % 2; + const int total_size = N * batch_size; + auto in_a = std::make_unique(total_size); + auto out_main = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); + + auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + + std::ostringstream oss; + oss << dev_type << " " << msg; + + START_TIMER(MONTGOMERY) + for (int i = 0; i < iters; ++i) { + ICICLE_CHECK(convert_montgomery(in_a.get(), N, is_to_montgomery, config, out)); + } + END_TIMER(MONTGOMERY, oss.str().c_str(), measure); + }; + + // Element-wise operation + // If config.batch_size>1, (columns_batch=true or false) the addition is done element-wise anyway, so it doesn't affect the test + + // convert_montgomery + FieldApiTest::random_samples(in_a.get(),total_size); + // reference + if (!s_is_cuda_registered) { + if (is_to_montgomery) { for (int i = 0; i < total_size; i++) { out_ref[i] = TypeParam::to_montgomery(in_a[i]); } } + else { for (int i = 0; i < total_size; i++) { out_ref[i] = TypeParam::from_montgomery(in_a[i]); } } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "montgomery", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "montgomery", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); +} - auto h_out_main = std::make_unique(R * C); - auto h_out_ref = std::make_unique(R * C); + +TYPED_TEST(FieldApiTest, VectorReduceOps) +{ + int seed = time(0); + 
srand(seed); + // ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const int total_size = N * batch_size; + auto in_a = std::make_unique(total_size); + auto out_main = std::make_unique(batch_size); + auto out_ref = std::make_unique(batch_size); + + auto vector_accumulate_wrapper = + [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { + return vector_accumulate(a, b, size, config); + }; + + auto run = + [&](const std::string& dev_type, TypeParam* out, bool measure, auto vec_op_func, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + + std::ostringstream oss; + oss << dev_type << " " << msg; + + START_TIMER(VECADD_sync) + for (int i = 0; i < iters; ++i) { + ICICLE_CHECK(vec_op_func(in_a.get(), N, config, out)); + } + END_TIMER(VECADD_sync, oss.str().c_str(), measure); + }; + + // // sum + FieldApiTest::random_samples(in_a.get(),total_size); + // reference + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + out_ref[idx_in_batch] = TypeParam::from(0); + } + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { + uint64_t idx_a = columns_batch ? 
idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; + out_ref[idx_in_batch] = out_ref[idx_in_batch] + in_a[idx_a]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(TypeParam))); + + + // // product + FieldApiTest::random_samples(in_a.get(),total_size); + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + out_ref[idx_in_batch] = TypeParam::from(1); + } + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { + uint64_t idx_a = columns_batch ? idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; + out_ref[idx_in_batch] = out_ref[idx_in_batch]*in_a[idx_a]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(TypeParam))); +} + +TYPED_TEST(FieldApiTest, scalarVectorOps) +{ + int seed = time(0); + srand(seed); + // ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const bool use_single_scalar = rand() % 2; + const int total_size = N * batch_size; + auto scalar_a = std::make_unique(use_single_scalar? 
1 : batch_size); + auto in_b = std::make_unique(total_size); + auto out_main = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); + + auto vector_accumulate_wrapper = + [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { + return vector_accumulate(a, b, size, config); + }; + + auto run = + [&](const std::string& dev_type, TypeParam* out, bool measure, auto vec_op_func, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + + std::ostringstream oss; + oss << dev_type << " " << msg; + + START_TIMER(VECADD_sync) + for (int i = 0; i < iters; ++i) { + ICICLE_CHECK(vec_op_func(scalar_a.get(), in_b.get(), N, use_single_scalar, config, out)); + } + END_TIMER(VECADD_sync, oss.str().c_str(), measure); + }; + + // // scalar add vec + FieldApiTest::random_samples(scalar_a.get(),(use_single_scalar? 1: batch_size)); + FieldApiTest::random_samples(in_b.get(),total_size); + + // reference + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { + uint64_t idx_b = columns_batch ? idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; + out_ref[idx_b] = (use_single_scalar? scalar_a[0] : scalar_a[idx_in_batch]) + in_b[idx_b]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + + + // // scalar sub vec + FieldApiTest::random_samples(scalar_a.get(),(use_single_scalar? 
1: batch_size)); + FieldApiTest::random_samples(in_b.get(),total_size); + + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { + uint64_t idx_b = columns_batch ? idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; + out_ref[idx_b] = (use_single_scalar? scalar_a[0] : scalar_a[idx_in_batch]) - in_b[idx_b]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); + } + + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + + // // scalar mul vec + FieldApiTest::random_samples(scalar_a.get(),(use_single_scalar? 1: batch_size)); + FieldApiTest::random_samples(in_b.get(),total_size); + + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { + uint64_t idx_b = columns_batch ? idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; + out_ref[idx_b] = (use_single_scalar? 
scalar_a[0] : scalar_a[idx_in_batch]) * in_b[idx_b]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); +} + +TYPED_TEST(FieldApiTest, matrixAPIsAsync) +{ + int seed = time(0); + srand(seed); + // ICICLE_LOG_DEBUG << "seed = " << seed; + const int R = 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 + const int C = 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 + const int batch_size = 1 << (rand() % 4); + const bool columns_batch = rand() % 2; + const bool is_in_place = rand() % 2; + // ICICLE_LOG_DEBUG << "R = " << R << ", C = " << C << ", batch_size = " << batch_size << ", columns_batch = " << columns_batch << ", is_in_place = " << is_in_place; //TODO SHANIE - remove this + const int total_size = R * C * batch_size; + auto h_inout = std::make_unique(total_size); + auto h_out_main = std::make_unique(total_size); + auto h_out_ref = std::make_unique(total_size); auto run = [&](const std::string& dev_type, TypeParam* h_out, bool measure, const char* msg, int iters) { Device dev = {dev_type, 0}; @@ -163,6 +436,8 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) DeviceProperties device_props; icicle_get_device_properties(device_props); auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; std::ostringstream oss; oss << dev_type << " " << msg; @@ -174,14 +449,14 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) icicle_create_stream(&config.stream); icicle_malloc_async((void**)&d_in, R * C * sizeof(TypeParam), config.stream); icicle_malloc_async((void**)&d_out, R * C * sizeof(TypeParam), config.stream); -
icicle_copy_to_device_async(d_in, h_in.get(), R * C * sizeof(TypeParam), config.stream); + icicle_copy_to_device_async(d_in, h_inout.get(), R * C * sizeof(TypeParam), config.stream); config.is_a_on_device = true; config.is_result_on_device = true; config.is_async = false; } - TypeParam* in = device_props.using_host_memory ? h_in.get() : d_in; + TypeParam* in = device_props.using_host_memory ? h_inout.get() : d_in; TypeParam* out = device_props.using_host_memory ? h_out : d_out; START_TIMER(TRANSPOSE) @@ -198,190 +473,577 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) } }; - run(s_reference_target, h_out_ref.get(), VERBOSE /*=measure*/, "transpose", ITERS); - run(s_main_target, h_out_main.get(), VERBOSE /*=measure*/, "transpose", ITERS); - ASSERT_EQ(0, memcmp(h_out_main.get(), h_out_ref.get(), R * C * sizeof(TypeParam))); + // // Option 1: Initialize each input matrix in the batch with the same ascending values + // for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + // for (uint32_t i = 0; i < R * C; i++) { + // if(columns_batch){ + // h_inout[idx_in_batch + batch_size * i] = TypeParam::from(i); + // } else { + // h_inout[idx_in_batch * R * C + i] = TypeParam::from(i); + // } + // } + // } + + // // Option 2: Initialize the entire input array with ascending values + // for (int i = 0; i < total_size; i++) { + // h_inout[i] = TypeParam::from(i); + // } + + // Option 3: Initialize the entire input array with random values + FieldApiTest::random_samples(h_inout.get(),total_size); + + // Reference implementation + if (!s_is_cuda_registered) { + const TypeParam* cur_mat_in = h_inout.get(); + TypeParam* cur_mat_out = h_out_ref.get(); + uint32_t stride = columns_batch? 
batch_size : 1; + const uint64_t total_elements_one_mat = static_cast(R) * C; + for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + // Perform the matrix transpose + for (uint32_t i = 0; i < R; ++i) { + for (uint32_t j = 0; j < C; ++j) { + cur_mat_out[stride*(j * R + i)] = cur_mat_in[stride*(i * C + j)]; + } + } + cur_mat_in += (columns_batch ? 1 : total_elements_one_mat); + cur_mat_out += (columns_batch ? 1 : total_elements_one_mat); + } + } else { + run(s_reference_target, (is_in_place? h_inout.get() : h_out_ref.get()), VERBOSE /*=measure*/, "transpose", ITERS); + } + + run(s_main_target, (is_in_place? h_inout.get() : h_out_main.get()), VERBOSE /*=measure*/, "transpose", ITERS); + if (is_in_place) { + ASSERT_EQ(0, memcmp(h_inout.get(), h_out_ref.get(), total_size * sizeof(TypeParam))); + } else { + // std::cout << "h_out_main:\t["; for (int i = 0; i < total_size-1; i++) { std::cout << h_out_main[i] << ", "; } std::cout <(N); - auto elements_ref = std::make_unique(N); - FieldApiTest::random_samples(elements_main.get(), N); - memcpy(elements_ref.get(), elements_main.get(), N * sizeof(TypeParam)); + int seed = time(0); + srand(seed); + // ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const bool is_in_place = rand() % 2; + const int total_size = N * batch_size; + + // const uint64_t N = 1 << (2); + // const int batch_size = 1 << (1); + // const bool columns_batch = true; + // const bool is_in_place = true; + // const int total_size = N * batch_size; + + auto in_a = std::make_unique(total_size); + auto out_main = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); - auto run = [&](const std::string& dev_type, TypeParam* inout, bool measure, const char* msg, int iters) { + auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { Device dev = 
{dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; std::ostringstream oss; oss << dev_type << " " << msg; - START_TIMER(MONTGOMERY) + START_TIMER(BIT_REVERSE) for (int i = 0; i < iters; ++i) { - ICICLE_CHECK(convert_montgomery(inout, N, true /*into montgomery*/, config, inout)); + ICICLE_CHECK(bit_reverse(in_a.get(), N, config, out)); } - END_TIMER(MONTGOMERY, oss.str().c_str(), measure); + END_TIMER(BIT_REVERSE, oss.str().c_str(), measure); }; - run(s_reference_target, elements_main.get(), VERBOSE /*=measure*/, "montgomery", 1); - run(s_main_target, elements_ref.get(), VERBOSE /*=measure*/, "montgomery", 1); - ASSERT_EQ(0, memcmp(elements_main.get(), elements_ref.get(), N * sizeof(TypeParam))); + + // // Option 1: Initialize each input vector in the batch with the same ascending values + // for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + // for (uint32_t i = 0; i < N; i++) { + // if(columns_batch){ + // in_a[idx_in_batch + batch_size * i] = TypeParam::from(i); + // } else { + // in_a[idx_in_batch * N + i] = TypeParam::from(i); + // } + // } + // } + + // // Option 2: Initialize the entire input array with ascending values + // for (int i = 0; i < total_size; i++) { + // in_a[i] = TypeParam::from(i); + // } + + // Option 3: Initialize the entire input array with random values + FieldApiTest::random_samples(in_a.get(),total_size); + + + // Reference implementation + if (!s_is_cuda_registered) { + uint64_t logn = 0; + uint64_t temp = N; + while (temp > 1) { + temp >>= 1; + logn++; + } + //BIT REVERSE FUNCTION + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t i = 0; i < N; i++) { + int rev = 0; + for (int j = 0; j < logn; ++j) { + if (i & (1 << j)) { rev |= 1 << (logn - 1 - j); } + } + if(columns_batch){ + out_ref[idx_in_batch + batch_size * i] = in_a[idx_in_batch + batch_size * rev]; + // 
ICICLE_LOG_DEBUG << "out_ref[" << idx_in_batch + batch_size * i << "] = in_a[" << idx_in_batch + batch_size * rev << "]"; + } else { + out_ref[idx_in_batch * N + i] = in_a[idx_in_batch * N + rev]; + // ICICLE_LOG_DEBUG << "out_ref[" << idx_in_batch * N + i << "] = in_a[" << idx_in_batch * N + rev << "]"; + } + } + } + } else { + run(s_reference_target, (is_in_place? in_a.get() : out_ref.get()), VERBOSE /*=measure*/, "bit-reverse", 1); + } + run(s_main_target, (is_in_place? in_a.get() : out_main.get()), VERBOSE /*=measure*/, "bit-reverse", 1); + + if (is_in_place) { + ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), N * sizeof(TypeParam))); + } else { + // std::cout << "out_main:\t["; for (int i = 0; i < total_size-1; i++) { std::cout << out_main[i] << ", "; } std::cout <(N); - auto elements_ref = std::make_unique(N); - FieldApiTest::random_samples(elements_main.get(), N); - memcpy(elements_ref.get(), elements_main.get(), N * sizeof(TypeParam)); + int seed = time(0); + srand(seed); + // ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t size_in = 1 << (rand() % 15 + 5); + const uint64_t offset = rand() % 15; + const uint64_t stride = rand() % 4 + 1; + const uint64_t size_out = rand() % (((size_in - offset)/stride)-1) + 1; + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const int total_size_in = size_in * batch_size; + const int total_size_out = size_out * batch_size; + // ICICLE_LOG_DEBUG << "size_in = " << size_in << ", offset = " << offset << ", stride = " << stride << ", size_out = " << size_out << ", batch_size = " << batch_size << ", columns_batch = " << columns_batch; + + auto in_a = std::make_unique(total_size_in); + auto out_main = std::make_unique(total_size_out); + auto out_ref = std::make_unique(total_size_out); - auto run = [&](const std::string& dev_type, TypeParam* inout, bool measure, const char* msg, int iters) { + auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int 
iters) { Device dev = {dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; std::ostringstream oss; oss << dev_type << " " << msg; - START_TIMER(BIT_REVERSE) + START_TIMER(SLICE) for (int i = 0; i < iters; ++i) { - ICICLE_CHECK(bit_reverse(inout, N, config, inout)); + ICICLE_CHECK(slice(in_a.get(), offset ,stride ,size_in , size_out , config, out)); } - END_TIMER(BIT_REVERSE, oss.str().c_str(), measure); + END_TIMER(SLICE, oss.str().c_str(), measure); }; - run(s_reference_target, elements_main.get(), VERBOSE /*=measure*/, "bit-reverse", 1); - run(s_main_target, elements_ref.get(), VERBOSE /*=measure*/, "bit-reverse", 1); - ASSERT_EQ(0, memcmp(elements_main.get(), elements_ref.get(), N * sizeof(TypeParam))); -} + // // Option 1: Initialize each input vector in the batch with the same ascending values + // for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + // for (uint32_t i = 0; i < size_in; i++) { + // if(columns_batch){ + // in_a[idx_in_batch + batch_size * i] = TypeParam::from(i); + // } else { + // in_a[idx_in_batch * size_in + i] = TypeParam::from(i); + // } + // } + // } -TYPED_TEST(FieldApiTest, Slice) + // // Option 2: Initialize the entire input array with ascending values + // for (int i = 0; i < total_size_in; i++) { + // in_a[i] = TypeParam::from(i); + // } + + // Option 3: Initialize the entire input array with random values + FieldApiTest::random_samples(in_a.get(),total_size_in); + + + // Reference implementation + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t i = 0; i < size_out; i++) { + if(columns_batch){ + out_ref[idx_in_batch + batch_size * i] = in_a[idx_in_batch + batch_size * (offset + i*stride)]; + } else { + out_ref[idx_in_batch * size_out + i] = in_a[idx_in_batch * size_in + (offset + i*stride)]; + } + } + } + } else { + 
run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "slice", 1); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "slice", 1); + // std::cout << "out_main\t["; for (int i = 0; i < total_size_out-1; i++) { std::cout << out_main[i] << ", "; } std::cout <(rand() % 4); - bool columns_batch; - if (logn == 7 || logn < 4) { - columns_batch = false; // currently not supported (icicle_v3/backend/cuda/src/ntt/ntt.cuh line 578) - } else { - columns_batch = rand() % 2; + // ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t coeffs_size = 1 << (rand() % 10 + 4); + const uint64_t domain_size = 1 << (rand() % 8 + 2); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const int total_coeffs_size = coeffs_size * batch_size; + + auto in_coeffs = std::make_unique(total_coeffs_size); + auto in_domain = std::make_unique(domain_size); + auto out_main = std::make_unique(total_coeffs_size); + auto out_ref = std::make_unique(total_coeffs_size); + + auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + + std::ostringstream oss; + oss << dev_type << " " << msg; + + START_TIMER(polynomialEval) + for (int i = 0; i < iters; ++i) { + ICICLE_CHECK(polynomial_eval(in_coeffs.get(), coeffs_size, in_domain.get(), domain_size , config, out)); + } + END_TIMER(polynomialEval, oss.str().c_str(), measure); + }; + + FieldApiTest::random_samples(in_coeffs.get(), total_coeffs_size); + FieldApiTest::random_samples(in_domain.get(), domain_size); + + + // Reference implementation + // TODO - Check in comperison with GPU implementation + + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); + if (s_is_cuda_registered) { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "polynomial_eval", 
1); + // std::cout << "out_main:\t["; for (int i = 0; i < total_coeffs_size-1; i++) { std::cout << out_main[i] << ", "; } std::cout <(rand() % 2); // 0: forward, 1: inverse - const int log_coset_stride = rand() % 3; - scalar_t coset_gen; - if (log_coset_stride) { - coset_gen = scalar_t::omega(logn + log_coset_stride); - } else { - coset_gen = scalar_t::one(); + } - const int total_size = N * batch_size; - auto scalars = std::make_unique(total_size); - FieldApiTest::random_samples(scalars.get(), total_size); - auto out_main = std::make_unique(total_size); - auto out_ref = std::make_unique(total_size); - auto run = [&](const std::string& dev_type, TypeParam* out, const char* msg, bool measure, int iters) { + +TYPED_TEST(FieldApiTest, polynomialDivision) +{ + int seed = time(0); + srand(seed); + // ICICLE_LOG_DEBUG << "seed = " << seed; + // const int64_t numerator_deg = 1 << 4; + // const int64_t denumerator_deg = 1 << 2; + // const uint64_t q_size = numerator_deg - denumerator_deg + 1; + // const uint64_t r_size = numerator_deg + 1; + const int64_t numerator_deg = 3; + const int64_t denumerator_deg = 2; + const uint64_t q_size = 2; + const uint64_t r_size = 4; + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + + const int64_t total_numerator_size = (numerator_deg+1) * batch_size; + const int64_t total_denumerator_size = (denumerator_deg+1) * batch_size; + const uint64_t total_q_size = q_size * batch_size; + const uint64_t total_r_size = r_size * batch_size; + + auto numerator = std::make_unique(total_numerator_size); + auto denumerator = std::make_unique(total_denumerator_size); + auto q_out_main = std::make_unique(total_q_size); + auto r_out_main = std::make_unique(total_r_size); + auto q_out_ref = std::make_unique(total_q_size); + auto r_out_ref = std::make_unique(total_r_size); + + auto run = [&](const std::string& dev_type, TypeParam* q_out, TypeParam* r_out, bool measure, const char* msg, int iters) { Device dev = {dev_type, 
0}; icicle_set_device(dev); - icicleStreamHandle stream = nullptr; - ICICLE_CHECK(icicle_create_stream(&stream)); - auto init_domain_config = default_ntt_init_domain_config(); - init_domain_config.stream = stream; - init_domain_config.is_async = false; - ConfigExtension ext; - ext.set(CudaBackendConfig::CUDA_NTT_FAST_TWIDDLES_MODE, true); - init_domain_config.ext = &ext; - auto config = default_ntt_config(); - config.stream = stream; - config.coset_gen = coset_gen; - config.batch_size = batch_size; // default: 1 - config.columns_batch = columns_batch; // default: false - config.ordering = ordering; // default: kNN - config.are_inputs_on_device = true; - config.are_outputs_on_device = true; - config.is_async = false; - ICICLE_CHECK(ntt_init_domain(scalar_t::omega(log_ntt_domain_size), init_domain_config)); - TypeParam *d_in, *d_out; - ICICLE_CHECK(icicle_malloc_async((void**)&d_in, total_size * sizeof(TypeParam), config.stream)); - ICICLE_CHECK(icicle_malloc_async((void**)&d_out, total_size * sizeof(TypeParam), config.stream)); - ICICLE_CHECK(icicle_copy_to_device_async(d_in, scalars.get(), total_size * sizeof(TypeParam), config.stream)); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + + std::ostringstream oss; oss << dev_type << " " << msg; - START_TIMER(NTT_sync) + + START_TIMER(polynomialDivision) for (int i = 0; i < iters; ++i) { - if (inplace) { - ICICLE_CHECK(ntt(d_in, N, dir, config, d_in)); - } else { - ICICLE_CHECK(ntt(d_in, N, dir, config, d_out)); - } + ICICLE_CHECK(polynomial_division(numerator.get(), numerator_deg, denumerator.get(), denumerator_deg , q_size, r_size, config, q_out, r_out)); } - END_TIMER(NTT_sync, oss.str().c_str(), measure); + END_TIMER(polynomialDivision, oss.str().c_str(), measure); + }; - if (inplace) { - ICICLE_CHECK(icicle_copy_to_host_async(out, d_in, total_size * sizeof(TypeParam), config.stream)); + // // Option 1: Initialize input vectors with random values 
+ // FieldApiTest::random_samples(numerator.get(), total_numerator_size); + // FieldApiTest::random_samples(denumerator.get(), total_denumerator_size); + // // Reference implementation + // TODO - Check in comperison with GPU implementation or implement a general reference implementation + + // Option 2: Initialize the numerator and denumerator with chosen example + // And the reference implementation for the example + + for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + if (columns_batch){ + // numerator = 3x^3+4x^2+5 + numerator[idx_in_batch + 0*batch_size] = TypeParam::from(5); + numerator[idx_in_batch + 1*batch_size] = TypeParam::from(0); + numerator[idx_in_batch + 2*batch_size] = TypeParam::from(4); + numerator[idx_in_batch + 3*batch_size] = TypeParam::from(3); + // denumerator = x^2-1 + denumerator[idx_in_batch + 0*batch_size] = TypeParam::from(0) - TypeParam::from(1); + denumerator[idx_in_batch + 1*batch_size] = TypeParam::from(0); + denumerator[idx_in_batch + 2*batch_size] = TypeParam::from(1); + if (!s_is_cuda_registered) { + // q_out_ref = 3x+4 + q_out_ref[idx_in_batch + 0*batch_size] = TypeParam::from(4); + q_out_ref[idx_in_batch + 1*batch_size] = TypeParam::from(3); + // r_out_ref = 3x+9 + r_out_ref[idx_in_batch + 0*batch_size] = TypeParam::from(9); + r_out_ref[idx_in_batch + 1*batch_size] = TypeParam::from(3); + } } else { - ICICLE_CHECK(icicle_copy_to_host_async(out, d_out, total_size * sizeof(TypeParam), config.stream)); + // numerator = 3x^3+4x^2+5 + numerator[idx_in_batch * (numerator_deg+1) + 0] = TypeParam::from(5); + numerator[idx_in_batch * (numerator_deg+1) + 1] = TypeParam::from(0); + numerator[idx_in_batch * (numerator_deg+1) + 2] = TypeParam::from(4); + numerator[idx_in_batch * (numerator_deg+1) + 3] = TypeParam::from(3); + // denumerator = x^2-1 + denumerator[idx_in_batch * (denumerator_deg+1) + 0] = TypeParam::from(0) - TypeParam::from(1); + denumerator[idx_in_batch * (denumerator_deg+1) + 1] = 
TypeParam::from(0); + denumerator[idx_in_batch * (denumerator_deg+1) + 2] = TypeParam::from(1); + if (!s_is_cuda_registered) { + // q_out_ref = 3x+4 + q_out_ref[idx_in_batch * q_size + 0] = TypeParam::from(4); + q_out_ref[idx_in_batch * q_size + 1] = TypeParam::from(3); + // r_out_ref = 3x+9 + r_out_ref[idx_in_batch * r_size + 0] = TypeParam::from(9); + r_out_ref[idx_in_batch * r_size + 1] = TypeParam::from(3); + } } - ICICLE_CHECK(icicle_free_async(d_in, config.stream)); - ICICLE_CHECK(icicle_free_async(d_out, config.stream)); - ICICLE_CHECK(icicle_stream_synchronize(config.stream)); - ICICLE_CHECK(icicle_destroy_stream(stream)); - ICICLE_CHECK(ntt_release_domain()); - }; - run(s_main_target, out_main.get(), "ntt", false /*=measure*/, 1 /*=iters*/); // warmup - run(s_reference_target, out_ref.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); - run(s_main_target, out_main.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); + } + + if (s_is_cuda_registered) { + run(s_reference_target, q_out_ref.get(), r_out_ref.get(), VERBOSE /*=measure*/, "polynomial_division", 1); + } + // std::cout << "numerator:\t["; for (int i = 0; i < total_numerator_size-1; i++) { std::cout << numerator[i] << ", "; } std::cout <(rand() % 4); +// // bool columns_batch; +// // if (logn == 7 || logn < 4) { +// // columns_batch = false; // currently not supported (icicle_v3/backend/cuda/src/ntt/ntt.cuh line 578) +// // } else { +// // // columns_batch = true; +// // columns_batch = rand() % 2; +// // } +// // // const NTTDir dir = static_cast(rand() % 2); // 0: forward, 1: inverse +// // const NTTDir dir = static_cast(0); // 0: forward, 1: inverse +// // const int log_coset_stride = rand() % 3; +// // scalar_t coset_gen; +// // if (log_coset_stride) { +// // coset_gen = scalar_t::omega(logn + log_coset_stride); +// // } else { +// // coset_gen = scalar_t::one(); +// // } + +// const bool inplace = false; +// 
const int logn = 15; +// const uint64_t N = 1 << logn; +// const int log_ntt_domain_size = logn; +// const int log_batch_size = 0; +// const int batch_size = 1 << log_batch_size; +// const Ordering ordering = static_cast(0); +// bool columns_batch = false; +// const NTTDir dir = static_cast(0); // 0: forward, 1: inverse +// const int log_coset_stride = 0; +// scalar_t coset_gen; +// if (log_coset_stride) { +// coset_gen = scalar_t::omega(logn + log_coset_stride); +// } else { +// coset_gen = scalar_t::one(); +// } + +// // TODO SHANIE : remove +// // ICICLE_LOG_INFO << "NTT test: seed=" << seed; +// // ICICLE_LOG_INFO << "NTT test: omega=" << scalar_t::omega(logn); +// // ICICLE_LOG_INFO << "NTT test:s inplace=" << inplace; +// ICICLE_LOG_INFO << "NTT test: logn=" << logn; +// // ICICLE_LOG_INFO << "NTT test: log_ntt_domain_size=" << log_ntt_domain_size; +// // ICICLE_LOG_INFO << "NTT test: log_batch_size=" << log_batch_size; +// // ICICLE_LOG_INFO << "NTT test: columns_batch=" << columns_batch; +// // ICICLE_LOG_INFO << "NTT test: ordering=" << int(ordering); +// ICICLE_LOG_INFO << "NTT test: dir=" << (dir == NTTDir::kForward ? 
"forward" : "inverse"); +// ICICLE_LOG_INFO << "NTT test: log_coset_stride=" << log_coset_stride; +// ICICLE_LOG_INFO << "NTT test: coset_gen=" << coset_gen; + + + +// const int total_size = N * batch_size; +// auto scalars = std::make_unique(total_size); +// FieldApiTest::random_samples(scalars.get(), total_size); +// // for (int i = 0; i < total_size; i++) { scalars[i] = scalar_t::from(i); } //FIXME SHANIE: remove +// auto out_main = std::make_unique(total_size); +// auto out_ref = std::make_unique(total_size); +// auto run = [&](const std::string& dev_type, TypeParam* out, const char* msg, bool measure, int iters) { +// Device dev = {dev_type, 0}; +// icicle_set_device(dev); +// icicleStreamHandle stream = nullptr; +// ICICLE_CHECK(icicle_create_stream(&stream)); +// auto init_domain_config = default_ntt_init_domain_config(); +// init_domain_config.stream = stream; +// init_domain_config.is_async = false; +// ConfigExtension ext; +// ext.set(CudaBackendConfig::CUDA_NTT_FAST_TWIDDLES_MODE, true); +// init_domain_config.ext = &ext; +// auto config = default_ntt_config(); +// config.stream = stream; +// config.coset_gen = coset_gen; +// config.batch_size = batch_size; // default: 1 +// config.columns_batch = columns_batch; // default: false +// config.ordering = ordering; // default: kNN +// config.are_inputs_on_device = true; +// config.are_outputs_on_device = true; +// config.is_async = false; +// ICICLE_CHECK(ntt_init_domain(scalar_t::omega(log_ntt_domain_size), init_domain_config)); +// TypeParam *d_in, *d_out; +// ICICLE_CHECK(icicle_malloc_async((void**)&d_in, total_size * sizeof(TypeParam), config.stream)); +// ICICLE_CHECK(icicle_malloc_async((void**)&d_out, total_size * sizeof(TypeParam), config.stream)); +// ICICLE_CHECK(icicle_copy_to_device_async(d_in, scalars.get(), total_size * sizeof(TypeParam), config.stream)); +// std::ostringstream oss; +// oss << dev_type << " " << msg; +// START_TIMER(NTT_sync) +// for (int i = 0; i < iters; ++i) { +// if 
(inplace) { +// ICICLE_CHECK(ntt(d_in, N, dir, config, d_in)); +// } else { +// ICICLE_CHECK(ntt(d_in, N, dir, config, d_out)); +// } +// } +// END_TIMER(NTT_sync, oss.str().c_str(), measure); + +// if (inplace) { +// ICICLE_CHECK(icicle_copy_to_host_async(out, d_in, total_size * sizeof(TypeParam), config.stream)); +// } else { +// ICICLE_CHECK(icicle_copy_to_host_async(out, d_out, total_size * sizeof(TypeParam), config.stream)); +// } +// ICICLE_CHECK(icicle_free_async(d_in, config.stream)); +// ICICLE_CHECK(icicle_free_async(d_out, config.stream)); +// ICICLE_CHECK(icicle_stream_synchronize(config.stream)); +// ICICLE_CHECK(icicle_destroy_stream(stream)); +// ICICLE_CHECK(ntt_release_domain()); +// }; +// // run(s_main_target, out_main.get(), "ntt", false /*=measure*/, 0 /*=iters*/); // warmup +// run(s_reference_target, out_ref.get(), "V3ntt", VERBOSE /*=measure*/, 10 /*=iters*/); +// run(s_main_target, out_main.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); +// // std::cout << "left:\t["; for (int i = 0; i < total_size-1; i++) { std::cout << out_main[i] << ", "; } std::cout < Date: Sun, 13 Oct 2024 18:01:09 +0300 Subject: [PATCH 09/43] vecops with batch - documentation --- docs/docs/icicle/golang-bindings/vec-ops.md | 10 ++++- docs/docs/icicle/primitives/vec_ops.md | 45 +++++++++++++++---- docs/docs/icicle/programmers_guide/general.md | 5 +++ docs/docs/icicle/rust-bindings/vec-ops.md | 13 ++++-- 4 files changed, 58 insertions(+), 15 deletions(-) diff --git a/docs/docs/icicle/golang-bindings/vec-ops.md b/docs/docs/icicle/golang-bindings/vec-ops.md index e93d9a0a2..e219ec26d 100644 --- a/docs/docs/icicle/golang-bindings/vec-ops.md +++ b/docs/docs/icicle/golang-bindings/vec-ops.md @@ -4,8 +4,8 @@ Icicle exposes a number of vector operations which a user can use: -* The VecOps API provides efficient vector operations such as addition, subtraction, and multiplication. 
-* MatrixTranspose API allows a user to perform a transpose on a vector representation of a matrix +* The VecOps API provides efficient vector operations such as addition, subtraction, and multiplication, supporting both single and batched operations. +* MatrixTranspose API allows a user to perform a transpose on a vector representation of a matrix, with support for batched transpositions. ## VecOps API Documentation @@ -121,6 +121,8 @@ type VecOpsConfig struct { isBOnDevice bool isResultOnDevice bool IsAsync bool + batch_size int + columns_batch bool Ext config_extension.ConfigExtensionHandler } ``` @@ -132,6 +134,8 @@ type VecOpsConfig struct { - **`isBOnDevice`**: Indicates if vector `b` is located on the device. - **`isResultOnDevice`**: Specifies where the result vector should be stored (device or host memory). - **`IsAsync`**: Controls whether the vector operation runs asynchronously. +- **`batch_size`**: Number of vectors (or operations) to process in a batch. Each vector operation will be performed independently on each batch element. +- **`columns_batch`**: true if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). If false, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array). - **`Ext`**: Extended configuration for backend. #### Default Configuration @@ -148,6 +152,8 @@ This section describes the functionality of the `TransposeMatrix` function used The function takes a matrix represented as a 1D slice and transposes it, storing the result in another 1D slice. +If VecOpsConfig specifies a batch_size greater than one, the transposition is performed on multiple matrices simultaneously, producing corresponding transposed matrices. The storage arrangement of batched matrices is determined by the columns_batch field in the VecOpsConfig. 
+ ### Function ```go diff --git a/docs/docs/icicle/primitives/vec_ops.md b/docs/docs/icicle/primitives/vec_ops.md index e9e10c1a9..7f546dc16 100644 --- a/docs/docs/icicle/primitives/vec_ops.md +++ b/docs/docs/icicle/primitives/vec_ops.md @@ -16,6 +16,8 @@ The `VecOpsConfig` struct is a configuration object used to specify parameters f - **`is_b_on_device: bool`**: Indicates whether the second input vector (`b`) is already on the device. If `false`, the vector will be copied from the host to the device. This field is optional. - **`is_result_on_device: bool`**: Indicates whether the result should be stored on the device. If `false`, the result will be transferred back to the host. - **`is_async: bool`**: Specifies whether the vector operation should be performed asynchronously. When `true`, the operation will not block the CPU, allowing other operations to proceed concurrently. Asynchronous execution requires careful synchronization to ensure data integrity. +- **`batch_size: int`**: Number of vectors (or operations) to process in a batch. Each vector operation will be performed independently on each batch element. +- **`columns_batch: bool`**: True if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). If false, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array). - **`ext: ConfigExtension*`**: Backend-specific extensions. #### Default Configuration @@ -28,6 +30,9 @@ static VecOpsConfig default_vec_ops_config() { false, // is_b_on_device false, // is_result_on_device false, // is_async + 1, // batch_size + false, // columns_batch + nullptr // ext }; return config; } @@ -35,7 +40,7 @@ static VecOpsConfig default_vec_ops_config() { ### Element-wise Operations -These functions perform element-wise operations on two input vectors `a` and `b`, producing an output vector. +These functions perform element-wise operations on two input vectors a and b. 
If VecOpsConfig specifies a batch_size greater than one, the operations are performed on multiple pairs of vectors simultaneously, producing corresponding output vectors. #### `vector_add` @@ -90,9 +95,31 @@ template eIcicleError convert_montgomery(const T* input, uint64_t size, bool is_into, const VecOpsConfig& config, T* output); ``` +### Reduction operations + +These functions perform reduction operations on vectors. If VecOpsConfig specifies a batch_size greater than one, the operations are performed on multiple vectors simultaneously, producing corresponding output values. The storage arrangement of batched vectors is determined by the columns_batch field in the VecOpsConfig. + +#### `vector_sum` + +Computes the sum of all elements in each vector in a batch. + +```cpp +template +eIcicleError vector_sum(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); +``` + +#### `vector_product` + +Computes the product of all elements in each vector in a batch. + +```cpp +template +eIcicleError vector_product(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); +``` + ### Scalar-Vector Operations -These functions apply a scalar operation to each element of a vector. +These functions apply a scalar operation to each element of a vector. If VecOpsConfig specifies a batch_size greater than one, the operations are performed on multiple vector-scalar pairs simultaneously, producing corresponding output vectors. #### `scalar_add_vec / scalar_sub_vec` @@ -123,7 +150,7 @@ eIcicleError scalar_mul_vec(const T* scalar_a, const T* vec_b, uint64_t size, co ### Matrix Operations -These functions perform operations on matrices. +These functions perform operations on matrices. If VecOpsConfig specifies a batch_size greater than one, the operations are performed on multiple matrices simultaneously, producing corresponding output matrices. 
#### `matrix_transpose` @@ -138,7 +165,7 @@ eIcicleError matrix_transpose(const T* mat_in, uint32_t nof_rows, uint32_t nof_c #### `bit_reverse` -Reorders the vector elements based on a bit-reversal pattern. +Reorders the vector elements based on a bit-reversal pattern. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously. ```cpp template @@ -147,16 +174,16 @@ eIcicleError bit_reverse(const T* vec_in, uint64_t size, const VecOpsConfig& con #### `slice` -Extracts a slice from a vector. +Extracts a slice from a vector. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously, producing corresponding output vectors. ```cpp template -eIcicleError slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size, const VecOpsConfig& config, T* vec_out); +eIcicleError slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size_in, uint64_t size_out, const VecOpsConfig& config, T* vec_out); ``` #### `highest_non_zero_idx` -Finds the highest non-zero index in a vector. +Finds the highest non-zero index in a vector. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously. ```cpp template @@ -165,7 +192,7 @@ eIcicleError highest_non_zero_idx(const T* vec_in, uint64_t size, const VecOpsCo #### `polynomial_eval` -Evaluates a polynomial at given domain points. +Evaluates a polynomial at given domain points. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously. ```cpp template @@ -174,7 +201,7 @@ eIcicleError polynomial_eval(const T* coeffs, uint64_t coeffs_size, const T* dom #### `polynomial_division` -Divides two polynomials. +Divides two polynomials. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously. 
```cpp template diff --git a/docs/docs/icicle/programmers_guide/general.md b/docs/docs/icicle/programmers_guide/general.md index b02cd2f9c..0bef2b850 100644 --- a/docs/docs/icicle/programmers_guide/general.md +++ b/docs/docs/icicle/programmers_guide/general.md @@ -21,6 +21,7 @@ The configuration struct allows users to modify settings such as: - Specifying whether inputs and outputs are on the host or device. - Adjusting the data layout for specific optimizations. +- Setting batching parameters (batch_size and columns_batch) to perform operations on multiple data sets simultaneously. - Passing custom options to the backend implementation through an extension mechanism, such as setting the number of CPU cores to use. ### Example (C++) @@ -31,6 +32,8 @@ The configuration struct allows users to modify settings such as: // Create config struct for vector add VecOpsConfig config = default_vec_ops_config(); // optionally modify the config struct here +config.batch_size = 4; // Process 4 vector operations in a batch +config.columns_batch = true; // Batched vectors are stored as columns // Call the API eIcicleError err = vector_add(vec_a, vec_b, size, config, vec_res); @@ -45,6 +48,8 @@ struct VecOpsConfig { bool is_b_on_device; /**< True if `b` is on the device, false if it is not. Default value: false. OPTIONAL. */ bool is_result_on_device; /**< If true, the output is preserved on the device, otherwise on the host. Default value: false. */ bool is_async; /**< Whether to run the vector operations asynchronously. */ + int batch_size; /**< Number of vector operations to process in a batch. Default value: 1. */ + bool columns_batch; /**< True if batched vectors are stored as columns; false if stored contiguously. Default value: false. */ ConfigExtension* ext = nullptr; /**< Backend-specific extension. 
*/ }; ``` diff --git a/docs/docs/icicle/rust-bindings/vec-ops.md b/docs/docs/icicle/rust-bindings/vec-ops.md index 61aa71570..c42caafb5 100644 --- a/docs/docs/icicle/rust-bindings/vec-ops.md +++ b/docs/docs/icicle/rust-bindings/vec-ops.md @@ -1,10 +1,10 @@ # Vector Operations API -Our vector operations API includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory. +Our vector operations API includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory, as well as batched operations. ## Vector Operations Configuration -The `VecOpsConfig` struct encapsulates the settings for vector operations, including device context and operation modes. +The `VecOpsConfig` struct encapsulates the settings for vector operations, including device context, operation modes, and batching parameters. ### `VecOpsConfig` @@ -17,6 +17,8 @@ pub struct VecOpsConfig { pub is_b_on_device: bool, pub is_result_on_device: bool, pub is_async: bool, + pub batch_size: usize, + pub columns_batch: bool, pub ext: ConfigExtension, } ``` @@ -28,6 +30,9 @@ pub struct VecOpsConfig { - **`is_b_on_device: bool`**: Indicates whether the input b data has been preloaded on the device memory. If `false` inputs will be copied from host to device. - **`is_result_on_device: bool`**: Indicates whether the output data is preloaded in device memory. If `false` outputs will be copied from host to device. - **`is_async: bool`**: Specifies whether the NTT operation should be performed asynchronously. +- **`batch_size: usize`**: Number of vector operations to process in a single batch. Each operation will be performed independently on each batch element. +- **`columns_batch: bool`**: true if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). 
If false, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array). + - **`ext: ConfigExtension`**: extended configuration for backend. ### Default Configuration @@ -40,11 +45,11 @@ let cfg = VecOpsConfig::default(); ## Vector Operations -Vector operations are implemented through the `VecOps` trait, providing methods for addition, subtraction, and multiplication of vectors. +Vector operations are implemented through the `VecOps` trait, providing methods for addition, subtraction, and multiplication of vectors. These methods support both single and batched operations based on the batch_size and columns_batch configurations. ### Methods -All operations are element-wise operations, and the results placed into the `result` param. These operations are not in place. +All operations are element-wise operations, and the results placed into the `result` param. These operations are not in place, except for accumulate. - **`add`**: Computes the element-wise sum of two vectors. - **`accumulate`**: Sum input b to a inplace. 
From 3a943a59fe86504b8558ed36ce5ef1c556360970 Mon Sep 17 00:00:00 2001 From: Shanie Winitz Date: Sun, 13 Oct 2024 18:04:33 +0300 Subject: [PATCH 10/43] formating --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 368 ++++++++++-------- icicle/include/icicle/api/babybear.h | 63 ++- icicle/include/icicle/api/bls12_377.h | 83 ++-- icicle/include/icicle/api/bls12_381.h | 83 ++-- icicle/include/icicle/api/bn254.h | 70 ++-- icicle/include/icicle/api/bw6_761.h | 74 ++-- icicle/include/icicle/api/grumpkin.h | 37 +- icicle/include/icicle/api/stark252.h | 29 +- .../include/icicle/backend/vec_ops_backend.h | 33 +- .../include/icicle/fields/complex_extension.h | 26 +- .../include/icicle/fields/quartic_extension.h | 8 +- .../default_backend/default_poly_backend.h | 14 +- icicle/include/icicle/utils/modifiers.h | 2 +- icicle/include/icicle/vec_ops.h | 149 +++---- icicle/src/vec_ops.cpp | 64 ++- icicle/tests/test_field_api.cpp | 360 +++++++++-------- 16 files changed, 878 insertions(+), 585 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index a56cdc73c..74678fc83 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -51,7 +51,13 @@ class VectorOpTask : public TaskBase VectorOpTask() : TaskBase() {} // Set the operands to execute a task of 2 operands and 1 output and dispatch the task - void send_2ops_task(VecOperation operation, const uint32_t nof_operations, const T* op_a, const T* op_b, const uint32_t stride , T* output) + void send_2ops_task( + VecOperation operation, + const uint32_t nof_operations, + const T* op_a, + const T* op_b, + const uint32_t stride, + T* output) { m_operation = operation; m_nof_operations = nof_operations; @@ -72,7 +78,8 @@ class VectorOpTask : public TaskBase dispatch(); } // Set the operands to execute a task of 1 operand and dispatch the task - void send_intermidiate_res_task(VecOperation operation, const uint64_t 
stop_index, const T* op_a, const uint64_t stride) + void + send_intermidiate_res_task(VecOperation operation, const uint64_t stop_index, const T* op_a, const uint64_t stride) { m_operation = operation; m_stop_index = stop_index; @@ -83,7 +90,13 @@ class VectorOpTask : public TaskBase // Set the operands for bit_reverse operation and dispatch the task void send_bit_reverse_task( - VecOperation operation, uint32_t bit_size, uint64_t start_index, const uint32_t nof_operations, const T* op_a, const uint64_t stride, T* output) + VecOperation operation, + uint32_t bit_size, + uint64_t start_index, + const uint32_t nof_operations, + const T* op_a, + const uint64_t stride, + T* output) { m_operation = operation; m_bit_size = bit_size; @@ -96,7 +109,13 @@ class VectorOpTask : public TaskBase } // Set the operands for slice operation and dispatch the task - void send_slice_task(VecOperation operation, uint64_t stride, uint64_t stride_out, const uint32_t nof_operations, const T* op_a, T* output) + void send_slice_task( + VecOperation operation, + uint64_t stride, + uint64_t stride_out, + const uint32_t nof_operations, + const T* op_a, + T* output) { m_operation = operation; m_nof_operations = nof_operations; @@ -108,13 +127,22 @@ class VectorOpTask : public TaskBase } // Set the operands for replace_elements operation and dispatch the task - void send_replace_elements_task(VecOperation operation, const T* mat_in, const uint32_t nof_operations, std::vector& start_indices_in_mat, uint64_t start_index, uint32_t log_nof_rows, uint32_t log_nof_cols, const uint32_t stride, T* mat_out) + void send_replace_elements_task( + VecOperation operation, + const T* mat_in, + const uint32_t nof_operations, + std::vector& start_indices_in_mat, + uint64_t start_index, + uint32_t log_nof_rows, + uint32_t log_nof_cols, + const uint32_t stride, + T* mat_out) { m_operation = operation; m_op_a = mat_in; m_nof_operations = nof_operations; m_start_indices_in_mat = &start_indices_in_mat; - 
m_start_index = start_index; //start index in start_indices vector + m_start_index = start_index; // start index in start_indices vector m_log_nof_rows = log_nof_rows; m_log_nof_cols = log_nof_cols; m_stride = stride; @@ -122,22 +150,27 @@ class VectorOpTask : public TaskBase dispatch(); } - void send_out_of_place_matrix_transpose_task(VecOperation operation, const T* mat_in, const uint32_t nof_operations, const uint32_t nof_rows, const uint32_t nof_cols, const uint32_t stride, T* mat_out) - { - m_operation = operation; - m_op_a = mat_in; - m_nof_operations = nof_operations; - m_nof_rows = nof_rows; - m_nof_cols = nof_cols; - m_stride = stride; - m_output = mat_out; - dispatch(); - } + void send_out_of_place_matrix_transpose_task( + VecOperation operation, + const T* mat_in, + const uint32_t nof_operations, + const uint32_t nof_rows, + const uint32_t nof_cols, + const uint32_t stride, + T* mat_out) + { + m_operation = operation; + m_op_a = mat_in; + m_nof_operations = nof_operations; + m_nof_rows = nof_rows; + m_nof_cols = nof_cols; + m_stride = stride; + m_output = mat_out; + dispatch(); + } // Execute the selected function based on m_operation - virtual void execute() { - (this->*functionPtrs[static_cast(m_operation)])(); - } + virtual void execute() { (this->*functionPtrs[static_cast(m_operation)])(); } private: // Single worker functionality to execute vector add (+) @@ -238,10 +271,10 @@ class VectorOpTask : public TaskBase if (m_output == m_op_a) { // inplace calculation if (rev_idx < idx) { // only on of the threads need to work - std::swap(m_output[m_stride*idx], m_output[m_stride*rev_idx]); + std::swap(m_output[m_stride * idx], m_output[m_stride * rev_idx]); } - } else { // out of place calculation - m_output[m_stride*idx] = m_op_a[m_stride*rev_idx]; // set index value + } else { // out of place calculation + m_output[m_stride * idx] = m_op_a[m_stride * rev_idx]; // set index value } } } @@ -255,7 +288,8 @@ class VectorOpTask : public TaskBase } // 
Function to perform modulus with Mersenne number - uint64_t mersenne_mod(uint64_t shifted_idx, uint32_t total_bits) { + uint64_t mersenne_mod(uint64_t shifted_idx, uint32_t total_bits) + { uint64_t mod = (1ULL << total_bits) - 1; shifted_idx = (shifted_idx & mod) + (shifted_idx >> total_bits); while (shifted_idx >= mod) { @@ -264,7 +298,6 @@ class VectorOpTask : public TaskBase return shifted_idx; } - // Single worker functionality to execute replace elements void replace_elements() { @@ -272,7 +305,7 @@ class VectorOpTask : public TaskBase for (uint32_t i = 0; i < m_nof_operations; ++i) { uint64_t start_idx = (*m_start_indices_in_mat)[m_start_index + i]; uint64_t idx = start_idx; - T prev = m_op_a[m_stride * idx]; + T prev = m_op_a[m_stride * idx]; do { uint64_t shifted_idx = idx << m_log_nof_rows; uint64_t new_idx = mersenne_mod(shifted_idx, total_bits); @@ -294,8 +327,6 @@ class VectorOpTask : public TaskBase } } - - // An array of available function pointers arranged according to the VecOperation enum using FunctionPtr = void (VectorOpTask::*)(); static constexpr std::array(NOF_OPERATIONS)> functionPtrs = { @@ -315,28 +346,28 @@ class VectorOpTask : public TaskBase &VectorOpTask::replace_elements, // REPLACE_ELEMENTS &VectorOpTask::out_of_place_transpose // OUT_OF_PLACE_MATRIX_TRANSPOSE - }; - VecOperation m_operation; // the operation to execute - uint32_t m_nof_operations; // number of operations to execute for this task - const T* m_op_a; // pointer to operand A. Operand A is a vector, or metrix in case of replace_elements - const T* m_op_b; // pointer to operand B. 
Operand B is a vector or scalar - uint64_t m_start_index; // index used in bitreverse operation and out of place matrix transpose - uint64_t m_stop_index; // index used in reduce operations and out of place matrix transpose - uint32_t m_bit_size; // use in bitrev operation - uint64_t m_stride; // used to support column batch operations - uint64_t m_stride_out; // used in slice operation - T* m_output; // pointer to the output. Can be a vector, scalar pointer, or a matrix pointer in case of replace_elements - uint32_t m_log_nof_rows; // log of the number of rows in the matrix, used in replace_elements - uint32_t m_log_nof_cols; // log of the number of columns in the matrix, used in replace_elements - uint32_t m_nof_rows; // the number of rows in the matrix, used in out of place matrix transpose - uint32_t m_nof_cols; // the number of columns in the matrix, used in out of place matrix transpose + VecOperation m_operation; // the operation to execute + uint32_t m_nof_operations; // number of operations to execute for this task + const T* m_op_a; // pointer to operand A. Operand A is a vector, or metrix in case of replace_elements + const T* m_op_b; // pointer to operand B. Operand B is a vector or scalar + uint64_t m_start_index; // index used in bitreverse operation and out of place matrix transpose + uint64_t m_stop_index; // index used in reduce operations and out of place matrix transpose + uint32_t m_bit_size; // use in bitrev operation + uint64_t m_stride; // used to support column batch operations + uint64_t m_stride_out; // used in slice operation + T* + m_output; // pointer to the output. 
Can be a vector, scalar pointer, or a matrix pointer in case of replace_elements + uint32_t m_log_nof_rows; // log of the number of rows in the matrix, used in replace_elements + uint32_t m_log_nof_cols; // log of the number of columns in the matrix, used in replace_elements + uint32_t m_nof_rows; // the number of rows in the matrix, used in out of place matrix transpose + uint32_t m_nof_cols; // the number of columns in the matrix, used in out of place matrix transpose const std::vector* m_start_indices_in_mat; // Indices used in replace_elements operations -public: - T m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer - uint64_t m_idx_in_batch; // index in the batch. Used in intermidiate res tasks +public: + T m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer + uint64_t m_idx_in_batch; // index in the batch. Used in intermidiate res tasks }; // class VectorOpTask #define NOF_OPERATIONS_PER_TASK 512 @@ -357,10 +388,11 @@ eIcicleError cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { TasksManager> task_manager(get_nof_workers(config) - 1); - const uint64_t total_nof_operations = size*config.batch_size; + const uint64_t total_nof_operations = size * config.batch_size; for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_2ops_task(op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), vec_a + i, vec_b + i, 1, output + i); + task_p->send_2ops_task( + op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), vec_a + i, vec_b + i, 1, output + i); } task_manager.wait_done(); return eIcicleError::SUCCESS; @@ -369,21 +401,27 @@ cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t size, // Execute a full task from the type vector = scalar (op) vector template eIcicleError 
cpu_scalar_vector_op( - VecOperation op, const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output) + VecOperation op, + const T* scalar_a, + const T* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig& config, + T* output) { TasksManager> task_manager(get_nof_workers(config) - 1); - const uint64_t total_nof_operations = use_single_scalar? size*config.batch_size : size; - const uint32_t stride = (!use_single_scalar && config.columns_batch)? config.batch_size : 1; - for (uint32_t idx_in_batch = 0; idx_in_batch < (use_single_scalar? 1 : config.batch_size); idx_in_batch++) { + const uint64_t total_nof_operations = use_single_scalar ? size * config.batch_size : size; + const uint32_t stride = (!use_single_scalar && config.columns_batch) ? config.batch_size : 1; + for (uint32_t idx_in_batch = 0; idx_in_batch < (use_single_scalar ? 1 : config.batch_size); idx_in_batch++) { for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_2ops_task( - op, - std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), - scalar_a + idx_in_batch, - (!use_single_scalar && config.columns_batch)? vec_b + idx_in_batch + i*config.batch_size : vec_b + idx_in_batch*size + i, - stride, - (!use_single_scalar && config.columns_batch)? output + idx_in_batch + i*config.batch_size : output + idx_in_batch*size + i); + op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), scalar_a + idx_in_batch, + (!use_single_scalar && config.columns_batch) ? vec_b + idx_in_batch + i * config.batch_size + : vec_b + idx_in_batch * size + i, + stride, + (!use_single_scalar && config.columns_batch) ? 
output + idx_in_batch + i * config.batch_size + : output + idx_in_batch * size + i); } } task_manager.wait_done(); @@ -394,8 +432,8 @@ eIcicleError cpu_scalar_vector_op( // Functions to register at the CPU backend /*********************************** ADD ***********************************/ template -eIcicleError -cpu_vector_add(const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_add( + const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { return cpu_2vectors_op(VecOperation::VECTOR_ADD, vec_a, vec_b, size, config, output); } @@ -414,8 +452,8 @@ REGISTER_VECTOR_ACCUMULATE_BACKEND("CPU", cpu_vector_accumulate); /*********************************** SUB ***********************************/ template -eIcicleError -cpu_vector_sub(const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_sub( + const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { return cpu_2vectors_op(VecOperation::VECTOR_SUB, vec_a, vec_b, size, config, output); } @@ -424,8 +462,8 @@ REGISTER_VECTOR_SUB_BACKEND("CPU", cpu_vector_sub); /*********************************** MUL ***********************************/ template -eIcicleError -cpu_vector_mul(const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_mul( + const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { return cpu_2vectors_op(VecOperation::VECTOR_MUL, vec_a, vec_b, size, config, output); } @@ -434,8 +472,8 @@ REGISTER_VECTOR_MUL_BACKEND("CPU", cpu_vector_mul); /*********************************** DIV ***********************************/ template -eIcicleError -cpu_vector_div(const Device& device, const T* vec_a, const T* vec_b, uint64_t size, 
const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_div( + const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { return cpu_2vectors_op(VecOperation::VECTOR_DIV, vec_a, vec_b, size, config, output); } @@ -448,16 +486,15 @@ eIcicleError cpu_convert_montgomery( const Device& device, const T* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, T* output) { TasksManager> task_manager(get_nof_workers(config) - 1); - const uint64_t total_nof_operations = size*config.batch_size; + const uint64_t total_nof_operations = size * config.batch_size; for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_1op_task( - (is_to_montgomery ? CONVERT_TO_MONTGOMERY : CONVERT_FROM_MONTGOMERY), std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), - input + i, output + i); + (is_to_montgomery ? CONVERT_TO_MONTGOMERY : CONVERT_FROM_MONTGOMERY), + std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), input + i, output + i); } task_manager.wait_done(); - for (uint64_t i = 0; i < size*config.batch_size; i++) { - } + for (uint64_t i = 0; i < size * config.batch_size; i++) {} return eIcicleError::SUCCESS; } @@ -482,28 +519,28 @@ eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t size, uint64_t idx_in_batch = 0; // run until all vector deployed and all tasks completed while (true) { - VectorOpTask* task_p = vec_a_offset < size ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); - if (task_p == nullptr) { - return eIcicleError::SUCCESS; - } + VectorOpTask* task_p = + vec_a_offset < size ? 
task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + if (task_p == nullptr) { return eIcicleError::SUCCESS; } if (task_p->is_completed()) { - output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] ? output[task_p->m_idx_in_batch] + task_p->m_intermidiate_res : task_p->m_intermidiate_res; + output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] + ? output[task_p->m_idx_in_batch] + task_p->m_intermidiate_res + : task_p->m_intermidiate_res; output_initialized[task_p->m_idx_in_batch] = true; } if (vec_a_offset < size) { task_p->m_idx_in_batch = idx_in_batch; task_p->send_intermidiate_res_task( - VecOperation::VECTOR_SUM, - std::min((uint64_t)NOF_OPERATIONS_PER_TASK , size - vec_a_offset), - config.columns_batch? vec_a + idx_in_batch + vec_a_offset*config.batch_size : vec_a + idx_in_batch*size + vec_a_offset, - config.columns_batch? config.batch_size : 1); + VecOperation::VECTOR_SUM, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - vec_a_offset), + config.columns_batch ? vec_a + idx_in_batch + vec_a_offset * config.batch_size + : vec_a + idx_in_batch * size + vec_a_offset, + config.columns_batch ? 
config.batch_size : 1); idx_in_batch++; if (idx_in_batch == config.batch_size) { vec_a_offset += NOF_OPERATIONS_PER_TASK; idx_in_batch = 0; } - } - else { + } else { task_p->set_idle(); } } @@ -513,7 +550,8 @@ REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); /*********************************** PRODUCT ***********************************/ template -eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) +eIcicleError +cpu_vector_product(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) { TasksManager> task_manager(get_nof_workers(config) - 1); std::vector output_initialized = std::vector(config.batch_size, false); @@ -521,28 +559,28 @@ eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t s uint64_t idx_in_batch = 0; // run until all vector deployed and all tasks completed while (true) { - VectorOpTask* task_p = vec_a_offset < size ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); - if (task_p == nullptr) { - return eIcicleError::SUCCESS; - } + VectorOpTask* task_p = + vec_a_offset < size ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + if (task_p == nullptr) { return eIcicleError::SUCCESS; } if (task_p->is_completed()) { - output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] ? output[task_p->m_idx_in_batch] * task_p->m_intermidiate_res : task_p->m_intermidiate_res; + output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] + ? output[task_p->m_idx_in_batch] * task_p->m_intermidiate_res + : task_p->m_intermidiate_res; output_initialized[task_p->m_idx_in_batch] = true; } if (vec_a_offset < size) { task_p->m_idx_in_batch = idx_in_batch; task_p->send_intermidiate_res_task( - VecOperation::VECTOR_PRODUCT, - std::min((uint64_t)NOF_OPERATIONS_PER_TASK , size - vec_a_offset), - config.columns_batch? 
vec_a + idx_in_batch + vec_a_offset*config.batch_size : vec_a + idx_in_batch*size + vec_a_offset, - config.columns_batch? config.batch_size : 1); + VecOperation::VECTOR_PRODUCT, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - vec_a_offset), + config.columns_batch ? vec_a + idx_in_batch + vec_a_offset * config.batch_size + : vec_a + idx_in_batch * size + vec_a_offset, + config.columns_batch ? config.batch_size : 1); idx_in_batch++; if (idx_in_batch == config.batch_size) { vec_a_offset += NOF_OPERATIONS_PER_TASK; idx_in_batch = 0; } - } - else { + } else { task_p->set_idle(); } } @@ -553,7 +591,13 @@ REGISTER_VECTOR_PRODUCT_BACKEND("CPU", cpu_vector_product); /*********************************** Scalar + Vector***********************************/ template eIcicleError cpu_scalar_add( - const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output) + const Device& device, + const T* scalar_a, + const T* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig& config, + T* output) { return cpu_scalar_vector_op(VecOperation::SCALAR_ADD_VEC, scalar_a, vec_b, size, use_single_scalar, config, output); } @@ -563,7 +607,13 @@ REGISTER_SCALAR_ADD_VEC_BACKEND("CPU", cpu_scalar_add); /*********************************** Scalar - Vector***********************************/ template eIcicleError cpu_scalar_sub( - const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output) + const Device& device, + const T* scalar_a, + const T* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig& config, + T* output) { return cpu_scalar_vector_op(VecOperation::SCALAR_SUB_VEC, scalar_a, vec_b, size, use_single_scalar, config, output); } @@ -573,7 +623,13 @@ REGISTER_SCALAR_SUB_VEC_BACKEND("CPU", cpu_scalar_sub); /*********************************** MUL BY SCALAR***********************************/ template eIcicleError 
cpu_scalar_mul( - const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output) + const Device& device, + const T* scalar_a, + const T* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig& config, + T* output) { return cpu_scalar_vector_op(VecOperation::SCALAR_MUL_VEC, scalar_a, vec_b, size, use_single_scalar, config, output); } @@ -587,22 +643,19 @@ eIcicleError out_of_place_matrix_transpose( const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) { TasksManager> task_manager(get_nof_workers(config) - 1); - uint32_t stride = config.columns_batch? config.batch_size : 1; + uint32_t stride = config.columns_batch ? config.batch_size : 1; const uint64_t total_elements_one_mat = static_cast(nof_rows) * nof_cols; - const uint32_t NOF_ROWS_PER_TASK = std::min((uint64_t)nof_rows, std::max((uint64_t)(NOF_OPERATIONS_PER_TASK / nof_cols) , (uint64_t)1)); + const uint32_t NOF_ROWS_PER_TASK = + std::min((uint64_t)nof_rows, std::max((uint64_t)(NOF_OPERATIONS_PER_TASK / nof_cols), (uint64_t)1)); for (uint32_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { - const T* cur_mat_in = config.columns_batch? mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements_one_mat; - T* cur_mat_out = config.columns_batch? mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements_one_mat; + const T* cur_mat_in = config.columns_batch ? mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements_one_mat; + T* cur_mat_out = config.columns_batch ? 
mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements_one_mat; // Perform the matrix transpose for (uint32_t i = 0; i < nof_rows; i += NOF_ROWS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_out_of_place_matrix_transpose_task( - OUT_OF_PLACE_MATRIX_TRANSPOSE, - cur_mat_in + stride*i*nof_cols, - std::min((uint64_t)NOF_ROWS_PER_TASK, (uint64_t)nof_rows - i), - nof_rows, - nof_cols, - stride, + OUT_OF_PLACE_MATRIX_TRANSPOSE, cur_mat_in + stride * i * nof_cols, + std::min((uint64_t)NOF_ROWS_PER_TASK, (uint64_t)nof_rows - i), nof_rows, nof_cols, stride, cur_mat_out + (stride * i)); } } @@ -610,7 +663,8 @@ eIcicleError out_of_place_matrix_transpose( return eIcicleError::SUCCESS; } -uint32_t gcd(uint32_t a, uint32_t b) { +uint32_t gcd(uint32_t a, uint32_t b) +{ while (b != 0) { uint32_t temp = b; b = a % b; @@ -621,9 +675,20 @@ uint32_t gcd(uint32_t a, uint32_t b) { // Recursive function to generate all k-ary necklaces and to replace the elements withing the necklaces template -void gen_necklace(uint32_t t, uint32_t p, uint32_t k, uint32_t length, std::vector& necklace, std::vector& task_indices) { +void gen_necklace( + uint32_t t, + uint32_t p, + uint32_t k, + uint32_t length, + std::vector& necklace, + std::vector& task_indices) +{ if (t > length) { - if (length % p == 0 && !std::all_of(necklace.begin() + 1, necklace.begin() + length + 1,[first_element = necklace[1]](uint32_t x) { return x == first_element; })) { + if ( + length % p == 0 && + !std::all_of(necklace.begin() + 1, necklace.begin() + length + 1, [first_element = necklace[1]](uint32_t x) { + return x == first_element; + })) { uint32_t start_idx = 0; uint64_t multiplier = 1; for (int i = length; i >= 1; --i) { // Compute start_idx as the decimal representation of the necklace @@ -645,17 +710,21 @@ void gen_necklace(uint32_t t, uint32_t p, uint32_t k, uint32_t length, std::vect } template -eIcicleError matrix_transpose_necklaces(const T* mat_in, uint32_t 
nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out){ +eIcicleError matrix_transpose_necklaces( + const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) +{ uint32_t log_nof_rows = static_cast(std::floor(std::log2(nof_rows))); uint32_t log_nof_cols = static_cast(std::floor(std::log2(nof_cols))); uint32_t gcd_value = gcd(log_nof_rows, log_nof_cols); uint32_t k = 1 << gcd_value; // Base of necklaces - uint32_t length = (log_nof_cols + log_nof_rows) / gcd_value; // length of necklaces. Since all are powers of 2, equvalent to (log_nof_cols + log_nof_rows) / gcd_value; + uint32_t length = + (log_nof_cols + log_nof_rows) / gcd_value; // length of necklaces. Since all are powers of 2, equvalent to + // (log_nof_cols + log_nof_rows) / gcd_value; const uint64_t max_nof_operations = NOF_OPERATIONS_PER_TASK / length; const uint64_t total_elements_one_mat = static_cast(nof_rows) * nof_cols; std::vector necklace(length + 1, 0); - std::vector start_indices_in_mat; // Collect start indices + std::vector start_indices_in_mat; // Collect start indices gen_necklace(1, 1, k, length, necklace, start_indices_in_mat); TasksManager> task_manager(get_nof_workers(config) - 1); @@ -664,22 +733,16 @@ eIcicleError matrix_transpose_necklaces(const T* mat_in, uint32_t nof_rows, uint for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_replace_elements_task( - REPLACE_ELEMENTS, - config.columns_batch? mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements_one_mat, - nof_operations, - start_indices_in_mat, - i, - log_nof_rows, - log_nof_cols, - config.columns_batch? config.batch_size : 1, - config.columns_batch? mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements_one_mat); + REPLACE_ELEMENTS, config.columns_batch ? 
mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements_one_mat, + nof_operations, start_indices_in_mat, i, log_nof_rows, log_nof_cols, + config.columns_batch ? config.batch_size : 1, + config.columns_batch ? mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements_one_mat); } } task_manager.wait_done(); return eIcicleError::SUCCESS; } - template eIcicleError cpu_matrix_transpose( const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) @@ -690,7 +753,7 @@ eIcicleError cpu_matrix_transpose( bool is_power_of_2 = (nof_rows & (nof_rows - 1)) == 0 && (nof_cols & (nof_cols - 1)) == 0; bool is_inplace = mat_in == mat_out; if (!is_inplace) { - return(out_of_place_matrix_transpose(device, mat_in, nof_rows, nof_cols, config, mat_out)); + return (out_of_place_matrix_transpose(device, mat_in, nof_rows, nof_cols, config, mat_out)); } else if (is_power_of_2) { return (matrix_transpose_necklaces(mat_in, nof_rows, nof_cols, config, mat_out)); } else { @@ -721,13 +784,10 @@ cpu_bit_reverse(const Device& device, const T* vec_in, uint64_t size, const VecO VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_bit_reverse_task( - BIT_REVERSE, - logn, - i, - std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - i), - config.columns_batch? vec_in + idx_in_batch : vec_in + idx_in_batch*size, - config.columns_batch? config.batch_size : 1, - config.columns_batch? vec_out + idx_in_batch: vec_out + idx_in_batch*size); + BIT_REVERSE, logn, i, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - i), + config.columns_batch ? vec_in + idx_in_batch : vec_in + idx_in_batch * size, + config.columns_batch ? config.batch_size : 1, + config.columns_batch ? 
vec_out + idx_in_batch : vec_out + idx_in_batch * size); } } task_manager.wait_done(); @@ -752,21 +812,19 @@ eIcicleError cpu_slice( const VecOpsConfig& config, T* vec_out) { - ICICLE_ASSERT(vec_in != nullptr && vec_out != nullptr) << "Error: Invalid argument - input or output vector is null"; - ICICLE_ASSERT(offset + (size_out-1) * stride < size_in) << "Error: Invalid argument - slice out of bound"; + ICICLE_ASSERT(offset + (size_out - 1) * stride < size_in) << "Error: Invalid argument - slice out of bound"; TasksManager> task_manager(get_nof_workers(config) - 1); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { for (uint64_t i = 0; i < size_out; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_slice_task( - SLICE, - config.columns_batch? stride*config.batch_size : stride, - config.columns_batch? config.batch_size : 1, + SLICE, config.columns_batch ? stride * config.batch_size : stride, config.columns_batch ? config.batch_size : 1, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size_out - i), - config.columns_batch? vec_in + idx_in_batch + (offset + i * stride)*config.batch_size : vec_in + idx_in_batch*size_in + offset + i * stride, - config.columns_batch? vec_out + idx_in_batch + i*config.batch_size : vec_out + idx_in_batch*size_out + i); + config.columns_batch ? vec_in + idx_in_batch + (offset + i * stride) * config.batch_size + : vec_in + idx_in_batch * size_in + offset + i * stride, + config.columns_batch ? vec_out + idx_in_batch + i * config.batch_size : vec_out + idx_in_batch * size_out + i); } } task_manager.wait_done(); @@ -783,11 +841,12 @@ template eIcicleError cpu_highest_non_zero_idx( const Device& device, const T* input, uint64_t size, const VecOpsConfig& config, int64_t* out_idx /*OUT*/) { - ICICLE_ASSERT(input && out_idx && size !=0) << "Error: Invalid argument"; - uint64_t stride = config.columns_batch? 
config.batch_size : 1; + ICICLE_ASSERT(input && out_idx && size != 0) << "Error: Invalid argument"; + uint64_t stride = config.columns_batch ? config.batch_size : 1; for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) { out_idx[idx_in_batch] = -1; // zero vector is considered '-1' since 0 would be zero in vec[0] - const T* curr_input = config.columns_batch? input + idx_in_batch : input + idx_in_batch * size; // Pointer to the current vector + const T* curr_input = + config.columns_batch ? input + idx_in_batch : input + idx_in_batch * size; // Pointer to the current vector for (int64_t i = size - 1; i >= 0; --i) { if (curr_input[i * stride] != T::zero()) { out_idx[idx_in_batch] = i; @@ -800,7 +859,6 @@ eIcicleError cpu_highest_non_zero_idx( REGISTER_HIGHEST_NON_ZERO_IDX_BACKEND("CPU", cpu_highest_non_zero_idx); - /*********************************** Polynomial evaluation ***********************************/ template @@ -818,12 +876,13 @@ eIcicleError cpu_poly_eval( // example: ax^2+bx+c is computed as (1) r=a, (2) r=r*x+b, (3) r=r*x+c uint64_t stride = config.columns_batch ? config.batch_size : 1; for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) { - const T* curr_coeffs = config.columns_batch? coeffs + idx_in_batch : coeffs + idx_in_batch * coeffs_size; - T* curr_evals = config.columns_batch? evals + idx_in_batch : evals + idx_in_batch * domain_size; + const T* curr_coeffs = config.columns_batch ? coeffs + idx_in_batch : coeffs + idx_in_batch * coeffs_size; + T* curr_evals = config.columns_batch ? 
evals + idx_in_batch : evals + idx_in_batch * domain_size; for (uint64_t eval_idx = 0; eval_idx < domain_size; ++eval_idx) { curr_evals[eval_idx * stride] = curr_coeffs[(coeffs_size - 1) * stride]; for (int64_t coeff_idx = coeffs_size - 2; coeff_idx >= 0; --coeff_idx) { - curr_evals[eval_idx * stride] = curr_evals[eval_idx * stride] * domain[eval_idx] + curr_coeffs[coeff_idx * stride]; + curr_evals[eval_idx * stride] = + curr_evals[eval_idx * stride] * domain[eval_idx] + curr_coeffs[coeff_idx * stride]; } } } @@ -838,7 +897,7 @@ void school_book_division_step_cpu(T* r, T* q, const T* b, int deg_r, int deg_b, { int64_t monomial = deg_r - deg_b; // monomial=1 is 'x', monomial=2 is x^2 etc. - T lc_r = r[deg_r * stride]; // leading coefficient of r + T lc_r = r[deg_r * stride]; // leading coefficient of r T monomial_coeff = lc_r * lc_b_inv; // lc_r / lc_b // adding monomial s to q (q=q+s) @@ -870,22 +929,27 @@ eIcicleError cpu_poly_divide( // ICICLE_CHECK(icicle_copy_async(r_out, numerator, r_size * config.batch_size * sizeof(T), config.stream)); // copy numerator to r_out // FIXME should it be copied using icicle_copy_async? - for (uint64_t i = 0; i < (numerator_deg+1)*config.batch_size; ++i) { + for (uint64_t i = 0; i < (numerator_deg + 1) * config.batch_size; ++i) { r_out[i] = numerator[i]; } - uint32_t stride = config.columns_batch? config.batch_size : 1; + uint32_t stride = config.columns_batch ? config.batch_size : 1; auto deg_r = std::make_unique(config.batch_size); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) { - const T* curr_denumerator = config.columns_batch? denumerator + idx_in_batch : denumerator + idx_in_batch * (denumerator_deg+1); // Pointer to the current vector - T* curr_q_out = config.columns_batch? q_out + idx_in_batch : q_out + idx_in_batch * q_size; // Pointer to the current vector - T* curr_r_out = config.columns_batch? 
r_out + idx_in_batch : r_out + idx_in_batch * r_size; // Pointer to the current vector + const T* curr_denumerator = config.columns_batch + ? denumerator + idx_in_batch + : denumerator + idx_in_batch * (denumerator_deg + 1); // Pointer to the current vector + T* curr_q_out = + config.columns_batch ? q_out + idx_in_batch : q_out + idx_in_batch * q_size; // Pointer to the current vector + T* curr_r_out = + config.columns_batch ? r_out + idx_in_batch : r_out + idx_in_batch * r_size; // Pointer to the current vector // invert largest coeff of b const T& lc_b_inv = T::inverse(curr_denumerator[denumerator_deg * stride]); deg_r[idx_in_batch] = numerator_deg; while (deg_r[idx_in_batch] >= denumerator_deg) { // each iteration is removing the largest monomial in r until deg(r)* config, babybear::scalar_t* output); + const babybear::scalar_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + babybear::scalar_t* output); extern "C" eIcicleError babybear_ntt_release_domain(); extern "C" eIcicleError babybear_extension_ntt( - const babybear::extension_t* input, int size, NTTDir dir, const NTTConfig* config, babybear::extension_t* output); - + const babybear::extension_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + babybear::extension_t* output); extern "C" void babybear_extension_generate_scalars(babybear::extension_t* scalars, int size); -extern "C" eIcicleError babybear_extension_scalar_convert_montgomery( - const babybear::extension_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, babybear::extension_t* output); +extern "C" eIcicleError babybear_extension_scalar_convert_montgomery( + const babybear::extension_t* input, + uint64_t size, + bool is_into, + const VecOpsConfig* config, + babybear::extension_t* output); extern "C" eIcicleError babybear_extension_vector_mul( - const babybear::extension_t* vec_a, const babybear::extension_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::extension_t* result); + const 
babybear::extension_t* vec_a, + const babybear::extension_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + babybear::extension_t* result); extern "C" eIcicleError babybear_extension_vector_add( - const babybear::extension_t* vec_a, const babybear::extension_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::extension_t* result); + const babybear::extension_t* vec_a, + const babybear::extension_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + babybear::extension_t* result); extern "C" eIcicleError babybear_extension_vector_sub( - const babybear::extension_t* vec_a, const babybear::extension_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::extension_t* result); + const babybear::extension_t* vec_a, + const babybear::extension_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + babybear::extension_t* result); extern "C" eIcicleError babybear_extension_matrix_transpose( const babybear::extension_t* input, @@ -50,15 +72,26 @@ extern "C" eIcicleError babybear_extension_matrix_transpose( extern "C" eIcicleError babybear_extension_bit_reverse( const babybear::extension_t* input, uint64_t n, const VecOpsConfig* config, babybear::extension_t* output); - extern "C" eIcicleError babybear_vector_mul( - const babybear::scalar_t* vec_a, const babybear::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* result); + const babybear::scalar_t* vec_a, + const babybear::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + babybear::scalar_t* result); extern "C" eIcicleError babybear_vector_add( - const babybear::scalar_t* vec_a, const babybear::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* result); + const babybear::scalar_t* vec_a, + const babybear::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + babybear::scalar_t* result); extern "C" eIcicleError babybear_vector_sub( - const babybear::scalar_t* vec_a, const babybear::scalar_t* vec_b, uint64_t n, const VecOpsConfig* 
config, babybear::scalar_t* result); + const babybear::scalar_t* vec_a, + const babybear::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + babybear::scalar_t* result); extern "C" eIcicleError babybear_matrix_transpose( const babybear::scalar_t* input, @@ -69,5 +102,3 @@ extern "C" eIcicleError babybear_matrix_transpose( extern "C" eIcicleError babybear_bit_reverse( const babybear::scalar_t* input, uint64_t n, const VecOpsConfig* config, babybear::scalar_t* output); - - diff --git a/icicle/include/icicle/api/bls12_377.h b/icicle/include/icicle/api/bls12_377.h index c617dcaf9..3bbb17ef5 100644 --- a/icicle/include/icicle/api/bls12_377.h +++ b/icicle/include/icicle/api/bls12_377.h @@ -19,23 +19,35 @@ extern "C" void bls12_377_g2_generate_projective_points(bls12_377::g2_projective extern "C" void bls12_377_g2_generate_affine_points(bls12_377::g2_affine_t* points, int size); extern "C" eIcicleError bls12_377_g2_affine_convert_montgomery( - const bls12_377::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::g2_affine_t* output); + const bls12_377::g2_affine_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + bls12_377::g2_affine_t* output); extern "C" eIcicleError bls12_377_g2_projective_convert_montgomery( - const bls12_377::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::g2_projective_t* output); + const bls12_377::g2_projective_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + bls12_377::g2_projective_t* output); extern "C" eIcicleError bls12_377_ecntt( - const bls12_377::projective_t* input, int size, NTTDir dir, const NTTConfig* config, bls12_377::projective_t* output); - + const bls12_377::projective_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + bls12_377::projective_t* output); extern "C" eIcicleError bls12_377_precompute_msm_bases( - const bls12_377::affine_t* bases, - int nof_bases, - const MSMConfig* config, - 
bls12_377::affine_t* output_bases); + const bls12_377::affine_t* bases, int nof_bases, const MSMConfig* config, bls12_377::affine_t* output_bases); extern "C" eIcicleError bls12_377_msm( - const bls12_377::scalar_t* scalars, const bls12_377::affine_t* points, int msm_size, const MSMConfig* config, bls12_377::projective_t* out); + const bls12_377::scalar_t* scalars, + const bls12_377::affine_t* points, + int msm_size, + const MSMConfig* config, + bls12_377::projective_t* out); extern "C" bool bls12_377_eq(bls12_377::projective_t* point1, bls12_377::projective_t* point2); @@ -49,38 +61,63 @@ extern "C" eIcicleError bls12_377_affine_convert_montgomery( const bls12_377::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::affine_t* output); extern "C" eIcicleError bls12_377_projective_convert_montgomery( - const bls12_377::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_377::projective_t* output); + const bls12_377::projective_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + bls12_377::projective_t* output); extern "C" eIcicleError bls12_377_g2_precompute_msm_bases( - const bls12_377::g2_affine_t* bases, - int nof_bases, - const MSMConfig* config, - bls12_377::g2_affine_t* output_bases); + const bls12_377::g2_affine_t* bases, int nof_bases, const MSMConfig* config, bls12_377::g2_affine_t* output_bases); extern "C" eIcicleError bls12_377_g2_msm( - const bls12_377::scalar_t* scalars, const bls12_377::g2_affine_t* points, int msm_size, const MSMConfig* config, bls12_377::g2_projective_t* out); + const bls12_377::scalar_t* scalars, + const bls12_377::g2_affine_t* points, + int msm_size, + const MSMConfig* config, + bls12_377::g2_projective_t* out); extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size); extern "C" void bls12_377_scalar_convert_montgomery( - const bls12_377::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bls12_377::scalar_t* 
output); + const bls12_377::scalar_t* input, + uint64_t size, + bool is_into, + const VecOpsConfig* config, + bls12_377::scalar_t* output); -extern "C" eIcicleError bls12_377_ntt_init_domain( - bls12_377::scalar_t* primitive_root, const NTTInitDomainConfig* config); +extern "C" eIcicleError +bls12_377_ntt_init_domain(bls12_377::scalar_t* primitive_root, const NTTInitDomainConfig* config); extern "C" eIcicleError bls12_377_ntt( - const bls12_377::scalar_t* input, int size, NTTDir dir, const NTTConfig* config, bls12_377::scalar_t* output); + const bls12_377::scalar_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + bls12_377::scalar_t* output); extern "C" eIcicleError bls12_377_ntt_release_domain(); extern "C" eIcicleError bls12_377_vector_mul( - const bls12_377::scalar_t* vec_a, const bls12_377::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bls12_377::scalar_t* result); + const bls12_377::scalar_t* vec_a, + const bls12_377::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bls12_377::scalar_t* result); extern "C" eIcicleError bls12_377_vector_add( - const bls12_377::scalar_t* vec_a, const bls12_377::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bls12_377::scalar_t* result); + const bls12_377::scalar_t* vec_a, + const bls12_377::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bls12_377::scalar_t* result); extern "C" eIcicleError bls12_377_vector_sub( - const bls12_377::scalar_t* vec_a, const bls12_377::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bls12_377::scalar_t* result); + const bls12_377::scalar_t* vec_a, + const bls12_377::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bls12_377::scalar_t* result); extern "C" eIcicleError bls12_377_matrix_transpose( const bls12_377::scalar_t* input, @@ -91,5 +128,3 @@ extern "C" eIcicleError bls12_377_matrix_transpose( extern "C" eIcicleError bls12_377_bit_reverse( const bls12_377::scalar_t* input, uint64_t n, const VecOpsConfig* 
config, bls12_377::scalar_t* output); - - diff --git a/icicle/include/icicle/api/bls12_381.h b/icicle/include/icicle/api/bls12_381.h index 361731586..b62e6a61a 100644 --- a/icicle/include/icicle/api/bls12_381.h +++ b/icicle/include/icicle/api/bls12_381.h @@ -19,23 +19,35 @@ extern "C" void bls12_381_g2_generate_projective_points(bls12_381::g2_projective extern "C" void bls12_381_g2_generate_affine_points(bls12_381::g2_affine_t* points, int size); extern "C" eIcicleError bls12_381_g2_affine_convert_montgomery( - const bls12_381::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::g2_affine_t* output); + const bls12_381::g2_affine_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + bls12_381::g2_affine_t* output); extern "C" eIcicleError bls12_381_g2_projective_convert_montgomery( - const bls12_381::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::g2_projective_t* output); + const bls12_381::g2_projective_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + bls12_381::g2_projective_t* output); extern "C" eIcicleError bls12_381_ecntt( - const bls12_381::projective_t* input, int size, NTTDir dir, const NTTConfig* config, bls12_381::projective_t* output); - + const bls12_381::projective_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + bls12_381::projective_t* output); extern "C" eIcicleError bls12_381_precompute_msm_bases( - const bls12_381::affine_t* bases, - int nof_bases, - const MSMConfig* config, - bls12_381::affine_t* output_bases); + const bls12_381::affine_t* bases, int nof_bases, const MSMConfig* config, bls12_381::affine_t* output_bases); extern "C" eIcicleError bls12_381_msm( - const bls12_381::scalar_t* scalars, const bls12_381::affine_t* points, int msm_size, const MSMConfig* config, bls12_381::projective_t* out); + const bls12_381::scalar_t* scalars, + const bls12_381::affine_t* points, + int msm_size, + const MSMConfig* config, + 
bls12_381::projective_t* out); extern "C" bool bls12_381_eq(bls12_381::projective_t* point1, bls12_381::projective_t* point2); @@ -49,38 +61,63 @@ extern "C" eIcicleError bls12_381_affine_convert_montgomery( const bls12_381::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::affine_t* output); extern "C" eIcicleError bls12_381_projective_convert_montgomery( - const bls12_381::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bls12_381::projective_t* output); + const bls12_381::projective_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + bls12_381::projective_t* output); extern "C" eIcicleError bls12_381_g2_precompute_msm_bases( - const bls12_381::g2_affine_t* bases, - int nof_bases, - const MSMConfig* config, - bls12_381::g2_affine_t* output_bases); + const bls12_381::g2_affine_t* bases, int nof_bases, const MSMConfig* config, bls12_381::g2_affine_t* output_bases); extern "C" eIcicleError bls12_381_g2_msm( - const bls12_381::scalar_t* scalars, const bls12_381::g2_affine_t* points, int msm_size, const MSMConfig* config, bls12_381::g2_projective_t* out); + const bls12_381::scalar_t* scalars, + const bls12_381::g2_affine_t* points, + int msm_size, + const MSMConfig* config, + bls12_381::g2_projective_t* out); extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size); extern "C" void bls12_381_scalar_convert_montgomery( - const bls12_381::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bls12_381::scalar_t* output); + const bls12_381::scalar_t* input, + uint64_t size, + bool is_into, + const VecOpsConfig* config, + bls12_381::scalar_t* output); -extern "C" eIcicleError bls12_381_ntt_init_domain( - bls12_381::scalar_t* primitive_root, const NTTInitDomainConfig* config); +extern "C" eIcicleError +bls12_381_ntt_init_domain(bls12_381::scalar_t* primitive_root, const NTTInitDomainConfig* config); extern "C" eIcicleError bls12_381_ntt( - const 
bls12_381::scalar_t* input, int size, NTTDir dir, const NTTConfig* config, bls12_381::scalar_t* output); + const bls12_381::scalar_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + bls12_381::scalar_t* output); extern "C" eIcicleError bls12_381_ntt_release_domain(); extern "C" eIcicleError bls12_381_vector_mul( - const bls12_381::scalar_t* vec_a, const bls12_381::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bls12_381::scalar_t* result); + const bls12_381::scalar_t* vec_a, + const bls12_381::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bls12_381::scalar_t* result); extern "C" eIcicleError bls12_381_vector_add( - const bls12_381::scalar_t* vec_a, const bls12_381::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bls12_381::scalar_t* result); + const bls12_381::scalar_t* vec_a, + const bls12_381::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bls12_381::scalar_t* result); extern "C" eIcicleError bls12_381_vector_sub( - const bls12_381::scalar_t* vec_a, const bls12_381::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bls12_381::scalar_t* result); + const bls12_381::scalar_t* vec_a, + const bls12_381::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bls12_381::scalar_t* result); extern "C" eIcicleError bls12_381_matrix_transpose( const bls12_381::scalar_t* input, @@ -91,5 +128,3 @@ extern "C" eIcicleError bls12_381_matrix_transpose( extern "C" eIcicleError bls12_381_bit_reverse( const bls12_381::scalar_t* input, uint64_t n, const VecOpsConfig* config, bls12_381::scalar_t* output); - - diff --git a/icicle/include/icicle/api/bn254.h b/icicle/include/icicle/api/bn254.h index 928cb639e..f3aad8d53 100644 --- a/icicle/include/icicle/api/bn254.h +++ b/icicle/include/icicle/api/bn254.h @@ -22,20 +22,28 @@ extern "C" eIcicleError bn254_g2_affine_convert_montgomery( const bn254::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::g2_affine_t* output); extern "C" 
eIcicleError bn254_g2_projective_convert_montgomery( - const bn254::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::g2_projective_t* output); + const bn254::g2_projective_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + bn254::g2_projective_t* output); extern "C" eIcicleError bn254_ecntt( - const bn254::projective_t* input, int size, NTTDir dir, const NTTConfig* config, bn254::projective_t* output); - + const bn254::projective_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + bn254::projective_t* output); extern "C" eIcicleError bn254_precompute_msm_bases( - const bn254::affine_t* bases, - int nof_bases, - const MSMConfig* config, - bn254::affine_t* output_bases); + const bn254::affine_t* bases, int nof_bases, const MSMConfig* config, bn254::affine_t* output_bases); extern "C" eIcicleError bn254_msm( - const bn254::scalar_t* scalars, const bn254::affine_t* points, int msm_size, const MSMConfig* config, bn254::projective_t* out); + const bn254::scalar_t* scalars, + const bn254::affine_t* points, + int msm_size, + const MSMConfig* config, + bn254::projective_t* out); extern "C" bool bn254_eq(bn254::projective_t* point1, bn254::projective_t* point2); @@ -49,38 +57,54 @@ extern "C" eIcicleError bn254_affine_convert_montgomery( const bn254::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::affine_t* output); extern "C" eIcicleError bn254_projective_convert_montgomery( - const bn254::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::projective_t* output); + const bn254::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bn254::projective_t* output); extern "C" eIcicleError bn254_g2_precompute_msm_bases( - const bn254::g2_affine_t* bases, - int nof_bases, - const MSMConfig* config, - bn254::g2_affine_t* output_bases); + const bn254::g2_affine_t* bases, int nof_bases, const MSMConfig* config, bn254::g2_affine_t* output_bases); 
extern "C" eIcicleError bn254_g2_msm( - const bn254::scalar_t* scalars, const bn254::g2_affine_t* points, int msm_size, const MSMConfig* config, bn254::g2_projective_t* out); + const bn254::scalar_t* scalars, + const bn254::g2_affine_t* points, + int msm_size, + const MSMConfig* config, + bn254::g2_projective_t* out); extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size); extern "C" void bn254_scalar_convert_montgomery( const bn254::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bn254::scalar_t* output); -extern "C" eIcicleError bn254_ntt_init_domain( - bn254::scalar_t* primitive_root, const NTTInitDomainConfig* config); +extern "C" eIcicleError bn254_ntt_init_domain(bn254::scalar_t* primitive_root, const NTTInitDomainConfig* config); extern "C" eIcicleError bn254_ntt( - const bn254::scalar_t* input, int size, NTTDir dir, const NTTConfig* config, bn254::scalar_t* output); + const bn254::scalar_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + bn254::scalar_t* output); extern "C" eIcicleError bn254_ntt_release_domain(); extern "C" eIcicleError bn254_vector_mul( - const bn254::scalar_t* vec_a, const bn254::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result); + const bn254::scalar_t* vec_a, + const bn254::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bn254::scalar_t* result); extern "C" eIcicleError bn254_vector_add( - const bn254::scalar_t* vec_a, const bn254::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result); + const bn254::scalar_t* vec_a, + const bn254::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bn254::scalar_t* result); extern "C" eIcicleError bn254_vector_sub( - const bn254::scalar_t* vec_a, const bn254::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result); + const bn254::scalar_t* vec_a, + const bn254::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + 
bn254::scalar_t* result); extern "C" eIcicleError bn254_matrix_transpose( const bn254::scalar_t* input, @@ -89,7 +113,5 @@ extern "C" eIcicleError bn254_matrix_transpose( const VecOpsConfig* config, bn254::scalar_t* output); -extern "C" eIcicleError bn254_bit_reverse( - const bn254::scalar_t* input, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* output); - - +extern "C" eIcicleError +bn254_bit_reverse(const bn254::scalar_t* input, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* output); diff --git a/icicle/include/icicle/api/bw6_761.h b/icicle/include/icicle/api/bw6_761.h index 6b48606a2..0147091e5 100644 --- a/icicle/include/icicle/api/bw6_761.h +++ b/icicle/include/icicle/api/bw6_761.h @@ -22,20 +22,28 @@ extern "C" eIcicleError bw6_761_g2_affine_convert_montgomery( const bw6_761::g2_affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::g2_affine_t* output); extern "C" eIcicleError bw6_761_g2_projective_convert_montgomery( - const bw6_761::g2_projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::g2_projective_t* output); + const bw6_761::g2_projective_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + bw6_761::g2_projective_t* output); extern "C" eIcicleError bw6_761_ecntt( - const bw6_761::projective_t* input, int size, NTTDir dir, const NTTConfig* config, bw6_761::projective_t* output); - + const bw6_761::projective_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + bw6_761::projective_t* output); extern "C" eIcicleError bw6_761_precompute_msm_bases( - const bw6_761::affine_t* bases, - int nof_bases, - const MSMConfig* config, - bw6_761::affine_t* output_bases); + const bw6_761::affine_t* bases, int nof_bases, const MSMConfig* config, bw6_761::affine_t* output_bases); extern "C" eIcicleError bw6_761_msm( - const bw6_761::scalar_t* scalars, const bw6_761::affine_t* points, int msm_size, const MSMConfig* config, bw6_761::projective_t* out); + const 
bw6_761::scalar_t* scalars, + const bw6_761::affine_t* points, + int msm_size, + const MSMConfig* config, + bw6_761::projective_t* out); extern "C" bool bw6_761_eq(bw6_761::projective_t* point1, bw6_761::projective_t* point2); @@ -49,38 +57,58 @@ extern "C" eIcicleError bw6_761_affine_convert_montgomery( const bw6_761::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::affine_t* output); extern "C" eIcicleError bw6_761_projective_convert_montgomery( - const bw6_761::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, bw6_761::projective_t* output); + const bw6_761::projective_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + bw6_761::projective_t* output); extern "C" eIcicleError bw6_761_g2_precompute_msm_bases( - const bw6_761::g2_affine_t* bases, - int nof_bases, - const MSMConfig* config, - bw6_761::g2_affine_t* output_bases); + const bw6_761::g2_affine_t* bases, int nof_bases, const MSMConfig* config, bw6_761::g2_affine_t* output_bases); extern "C" eIcicleError bw6_761_g2_msm( - const bw6_761::scalar_t* scalars, const bw6_761::g2_affine_t* points, int msm_size, const MSMConfig* config, bw6_761::g2_projective_t* out); + const bw6_761::scalar_t* scalars, + const bw6_761::g2_affine_t* points, + int msm_size, + const MSMConfig* config, + bw6_761::g2_projective_t* out); extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size); extern "C" void bw6_761_scalar_convert_montgomery( const bw6_761::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, bw6_761::scalar_t* output); -extern "C" eIcicleError bw6_761_ntt_init_domain( - bw6_761::scalar_t* primitive_root, const NTTInitDomainConfig* config); +extern "C" eIcicleError bw6_761_ntt_init_domain(bw6_761::scalar_t* primitive_root, const NTTInitDomainConfig* config); extern "C" eIcicleError bw6_761_ntt( - const bw6_761::scalar_t* input, int size, NTTDir dir, const NTTConfig* config, bw6_761::scalar_t* output); + 
const bw6_761::scalar_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + bw6_761::scalar_t* output); extern "C" eIcicleError bw6_761_ntt_release_domain(); extern "C" eIcicleError bw6_761_vector_mul( - const bw6_761::scalar_t* vec_a, const bw6_761::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bw6_761::scalar_t* result); + const bw6_761::scalar_t* vec_a, + const bw6_761::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bw6_761::scalar_t* result); extern "C" eIcicleError bw6_761_vector_add( - const bw6_761::scalar_t* vec_a, const bw6_761::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bw6_761::scalar_t* result); + const bw6_761::scalar_t* vec_a, + const bw6_761::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bw6_761::scalar_t* result); extern "C" eIcicleError bw6_761_vector_sub( - const bw6_761::scalar_t* vec_a, const bw6_761::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, bw6_761::scalar_t* result); + const bw6_761::scalar_t* vec_a, + const bw6_761::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + bw6_761::scalar_t* result); extern "C" eIcicleError bw6_761_matrix_transpose( const bw6_761::scalar_t* input, @@ -89,7 +117,5 @@ extern "C" eIcicleError bw6_761_matrix_transpose( const VecOpsConfig* config, bw6_761::scalar_t* output); -extern "C" eIcicleError bw6_761_bit_reverse( - const bw6_761::scalar_t* input, uint64_t n, const VecOpsConfig* config, bw6_761::scalar_t* output); - - +extern "C" eIcicleError +bw6_761_bit_reverse(const bw6_761::scalar_t* input, uint64_t n, const VecOpsConfig* config, bw6_761::scalar_t* output); diff --git a/icicle/include/icicle/api/grumpkin.h b/icicle/include/icicle/api/grumpkin.h index 42b1b2195..4c308e5c3 100644 --- a/icicle/include/icicle/api/grumpkin.h +++ b/icicle/include/icicle/api/grumpkin.h @@ -10,13 +10,14 @@ #include "icicle/vec_ops.h" extern "C" eIcicleError grumpkin_precompute_msm_bases( - const grumpkin::affine_t* bases, - int nof_bases, - 
const MSMConfig* config, - grumpkin::affine_t* output_bases); + const grumpkin::affine_t* bases, int nof_bases, const MSMConfig* config, grumpkin::affine_t* output_bases); extern "C" eIcicleError grumpkin_msm( - const grumpkin::scalar_t* scalars, const grumpkin::affine_t* points, int msm_size, const MSMConfig* config, grumpkin::projective_t* out); + const grumpkin::scalar_t* scalars, + const grumpkin::affine_t* points, + int msm_size, + const MSMConfig* config, + grumpkin::projective_t* out); extern "C" bool grumpkin_eq(grumpkin::projective_t* point1, grumpkin::projective_t* point2); @@ -30,7 +31,11 @@ extern "C" eIcicleError grumpkin_affine_convert_montgomery( const grumpkin::affine_t* input, size_t n, bool is_into, const VecOpsConfig* config, grumpkin::affine_t* output); extern "C" eIcicleError grumpkin_projective_convert_montgomery( - const grumpkin::projective_t* input, size_t n, bool is_into, const VecOpsConfig* config, grumpkin::projective_t* output); + const grumpkin::projective_t* input, + size_t n, + bool is_into, + const VecOpsConfig* config, + grumpkin::projective_t* output); extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size); @@ -38,13 +43,25 @@ extern "C" void grumpkin_scalar_convert_montgomery( const grumpkin::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, grumpkin::scalar_t* output); extern "C" eIcicleError grumpkin_vector_mul( - const grumpkin::scalar_t* vec_a, const grumpkin::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, grumpkin::scalar_t* result); + const grumpkin::scalar_t* vec_a, + const grumpkin::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + grumpkin::scalar_t* result); extern "C" eIcicleError grumpkin_vector_add( - const grumpkin::scalar_t* vec_a, const grumpkin::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, grumpkin::scalar_t* result); + const grumpkin::scalar_t* vec_a, + const grumpkin::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, 
+ grumpkin::scalar_t* result); extern "C" eIcicleError grumpkin_vector_sub( - const grumpkin::scalar_t* vec_a, const grumpkin::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, grumpkin::scalar_t* result); + const grumpkin::scalar_t* vec_a, + const grumpkin::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + grumpkin::scalar_t* result); extern "C" eIcicleError grumpkin_matrix_transpose( const grumpkin::scalar_t* input, @@ -55,5 +72,3 @@ extern "C" eIcicleError grumpkin_matrix_transpose( extern "C" eIcicleError grumpkin_bit_reverse( const grumpkin::scalar_t* input, uint64_t n, const VecOpsConfig* config, grumpkin::scalar_t* output); - - diff --git a/icicle/include/icicle/api/stark252.h b/icicle/include/icicle/api/stark252.h index 6a8ff1a74..5020a5966 100644 --- a/icicle/include/icicle/api/stark252.h +++ b/icicle/include/icicle/api/stark252.h @@ -14,22 +14,37 @@ extern "C" void stark252_generate_scalars(stark252::scalar_t* scalars, int size) extern "C" void stark252_scalar_convert_montgomery( const stark252::scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, stark252::scalar_t* output); -extern "C" eIcicleError stark252_ntt_init_domain( - stark252::scalar_t* primitive_root, const NTTInitDomainConfig* config); +extern "C" eIcicleError stark252_ntt_init_domain(stark252::scalar_t* primitive_root, const NTTInitDomainConfig* config); extern "C" eIcicleError stark252_ntt( - const stark252::scalar_t* input, int size, NTTDir dir, const NTTConfig* config, stark252::scalar_t* output); + const stark252::scalar_t* input, + int size, + NTTDir dir, + const NTTConfig* config, + stark252::scalar_t* output); extern "C" eIcicleError stark252_ntt_release_domain(); extern "C" eIcicleError stark252_vector_mul( - const stark252::scalar_t* vec_a, const stark252::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, stark252::scalar_t* result); + const stark252::scalar_t* vec_a, + const stark252::scalar_t* vec_b, + uint64_t n, + const 
VecOpsConfig* config, + stark252::scalar_t* result); extern "C" eIcicleError stark252_vector_add( - const stark252::scalar_t* vec_a, const stark252::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, stark252::scalar_t* result); + const stark252::scalar_t* vec_a, + const stark252::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + stark252::scalar_t* result); extern "C" eIcicleError stark252_vector_sub( - const stark252::scalar_t* vec_a, const stark252::scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, stark252::scalar_t* result); + const stark252::scalar_t* vec_a, + const stark252::scalar_t* vec_b, + uint64_t n, + const VecOpsConfig* config, + stark252::scalar_t* result); extern "C" eIcicleError stark252_matrix_transpose( const stark252::scalar_t* input, @@ -40,5 +55,3 @@ extern "C" eIcicleError stark252_matrix_transpose( extern "C" eIcicleError stark252_bit_reverse( const stark252::scalar_t* input, uint64_t n, const VecOpsConfig* config, stark252::scalar_t* output); - - diff --git a/icicle/include/icicle/backend/vec_ops_backend.h b/icicle/include/icicle/backend/vec_ops_backend.h index 92610798f..b602e2644 100644 --- a/icicle/include/icicle/backend/vec_ops_backend.h +++ b/icicle/include/icicle/backend/vec_ops_backend.h @@ -16,11 +16,7 @@ namespace icicle { scalar_t* output)>; using vectorVectorOpImplInplaceA = std::function; + const Device& device, scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config)>; using scalarConvertMontgomeryImpl = std::function; using VectorReduceOpImpl = std::function; + const Device& device, const scalar_t* vec_a, uint64_t size, const VecOpsConfig& config, scalar_t* output)>; using scalarVectorOpImpl = std::function; using scalarBitReverseOpImpl = std::function; + const Device& device, const scalar_t* input, uint64_t size, const VecOpsConfig& config, scalar_t* output)>; using scalarSliceOpImpl = std::function; using scalarHighNonZeroIdxOpImpl = std::function; + const Device& device, 
const scalar_t* input, uint64_t size, const VecOpsConfig& config, int64_t* out_idx)>; using scalarPolyEvalImpl = std::function; - - - void register_vector_add(const std::string& deviceType, vectorVectorOpImpl impl); #define REGISTER_VECTOR_ADD_BACKEND(DEVICE_TYPE, FUNC) \ @@ -173,10 +154,10 @@ namespace icicle { void register_vector_product(const std::string& deviceType, VectorReduceOpImpl impl); -#define REGISTER_VECTOR_PRODUCT_BACKEND(DEVICE_TYPE, FUNC) \ +#define REGISTER_VECTOR_PRODUCT_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ - static bool UNIQUE(_reg_vec_product) = []() -> bool { \ - register_vector_product(DEVICE_TYPE, FUNC); \ + static bool UNIQUE(_reg_vec_product) = []() -> bool { \ + register_vector_product(DEVICE_TYPE, FUNC); \ return true; \ }(); \ } diff --git a/icicle/include/icicle/fields/complex_extension.h b/icicle/include/icicle/fields/complex_extension.h index 9b4d35d24..6495822bd 100644 --- a/icicle/include/icicle/fields/complex_extension.h +++ b/icicle/include/icicle/fields/complex_extension.h @@ -36,9 +36,15 @@ class ComplexExtensionField FF real; FF imaginary; - static constexpr HOST_DEVICE_INLINE ComplexExtensionField zero() { return ComplexExtensionField{FF::zero(), FF::zero()}; } + static constexpr HOST_DEVICE_INLINE ComplexExtensionField zero() + { + return ComplexExtensionField{FF::zero(), FF::zero()}; + } - static constexpr HOST_DEVICE_INLINE ComplexExtensionField one() { return ComplexExtensionField{FF::one(), FF::zero()}; } + static constexpr HOST_DEVICE_INLINE ComplexExtensionField one() + { + return ComplexExtensionField{FF::one(), FF::zero()}; + } static constexpr HOST_DEVICE_INLINE ComplexExtensionField to_montgomery(const ComplexExtensionField& xs) { @@ -50,7 +56,10 @@ class ComplexExtensionField return ComplexExtensionField{xs.real * FF{CONFIG::montgomery_r_inv}, xs.imaginary * FF{CONFIG::montgomery_r_inv}}; } - static HOST_INLINE ComplexExtensionField rand_host() { return ComplexExtensionField{FF::rand_host(), 
FF::rand_host()}; } + static HOST_INLINE ComplexExtensionField rand_host() + { + return ComplexExtensionField{FF::rand_host(), FF::rand_host()}; + } static void rand_host_many(ComplexExtensionField* out, int size) { @@ -61,7 +70,8 @@ class ComplexExtensionField template static constexpr HOST_DEVICE_INLINE ComplexExtensionField sub_modulus(const ComplexExtensionField& xs) { - return ComplexExtensionField{FF::sub_modulus(&xs.real), FF::sub_modulus(&xs.imaginary)}; + return ComplexExtensionField{ + FF::sub_modulus(&xs.real), FF::sub_modulus(&xs.imaginary)}; } friend std::ostream& operator<<(std::ostream& os, const ComplexExtensionField& xs) @@ -101,7 +111,8 @@ class ComplexExtensionField } template - static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ComplexExtensionField& xs, const ComplexExtensionField& ys) + static constexpr HOST_DEVICE_INLINE ExtensionWide + mul_wide(const ComplexExtensionField& xs, const ComplexExtensionField& ys) { FWide real_prod = FF::mul_wide(xs.real, ys.real); FWide imaginary_prod = FF::mul_wide(xs.imaginary, ys.imaginary); @@ -142,7 +153,10 @@ class ComplexExtensionField return (xs.real == ys.real) && (xs.imaginary == ys.imaginary); } - friend HOST_DEVICE_INLINE bool operator!=(const ComplexExtensionField& xs, const ComplexExtensionField& ys) { return !(xs == ys); } + friend HOST_DEVICE_INLINE bool operator!=(const ComplexExtensionField& xs, const ComplexExtensionField& ys) + { + return !(xs == ys); + } template static HOST_DEVICE_INLINE ComplexExtensionField mul_const(const ComplexExtensionField& xs) diff --git a/icicle/include/icicle/fields/quartic_extension.h b/icicle/include/icicle/fields/quartic_extension.h index 923b31f3a..2ba17c05c 100644 --- a/icicle/include/icicle/fields/quartic_extension.h +++ b/icicle/include/icicle/fields/quartic_extension.h @@ -119,7 +119,8 @@ class QuarticExtensionField } template - static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const QuarticExtensionField& xs, const 
QuarticExtensionField& ys) + static constexpr HOST_DEVICE_INLINE ExtensionWide + mul_wide(const QuarticExtensionField& xs, const QuarticExtensionField& ys) { if (CONFIG::nonresidue_is_negative) return ExtensionWide{ @@ -179,7 +180,10 @@ class QuarticExtensionField return (xs.real == ys.real) && (xs.im1 == ys.im1) && (xs.im2 == ys.im2) && (xs.im3 == ys.im3); } - friend HOST_DEVICE_INLINE bool operator!=(const QuarticExtensionField& xs, const QuarticExtensionField& ys) { return !(xs == ys); } + friend HOST_DEVICE_INLINE bool operator!=(const QuarticExtensionField& xs, const QuarticExtensionField& ys) + { + return !(xs == ys); + } template static constexpr HOST_DEVICE_INLINE QuarticExtensionField mul_unsigned(const QuarticExtensionField& xs) diff --git a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h index 0ee0e2d0f..c7e53b218 100644 --- a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h +++ b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h @@ -278,15 +278,7 @@ namespace icicle { config.is_result_on_device = true; ICICLE_CHECK(icicle::polynomial_division( - a_coeffs, - deg_a, - b_coeffs, - deg_b, - deg_a - deg_b + 1, - a_N, - config, - Q_coeffs, - R_coeffs)); + a_coeffs, deg_a, b_coeffs, deg_b, deg_a - deg_b + 1, a_N, config, Q_coeffs, R_coeffs)); } void quotient(PolyContext Q, PolyContext op_a, PolyContext op_b) override @@ -554,8 +546,8 @@ namespace icicle { config.is_result_on_device = true; config.is_async = true; config.stream = m_stream; - ICICLE_CHECK( - icicle::slice(get_context_storage_immutable(p), 0 /*offset*/, stride, poly_size, domain_size, config, d_evals)); + ICICLE_CHECK(icicle::slice( + get_context_storage_immutable(p), 0 /*offset*/, stride, poly_size, domain_size, config, d_evals)); } else { ICICLE_CHECK(icicle_memset(d_evals, 0, domain_size * sizeof(I))); auto ntt_config = default_ntt_config(); diff 
--git a/icicle/include/icicle/utils/modifiers.h b/icicle/include/icicle/utils/modifiers.h index 74520c9f9..ac62028a8 100644 --- a/icicle/include/icicle/utils/modifiers.h +++ b/icicle/include/icicle/utils/modifiers.h @@ -14,7 +14,7 @@ #define HOST_INLINE __host__ INLINE_MACRO #define DEVICE_INLINE __device__ INLINE_MACRO -#define HOST_DEVICE __host__ __device__ +#define HOST_DEVICE __host__ __device__ #define HOST_DEVICE_INLINE HOST_DEVICE INLINE_MACRO #else // not CUDA #define INLINE_MACRO diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index b89327eb4..524cbcdc5 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -17,22 +17,21 @@ namespace icicle { * @note APIs with a single input, ignore input b. */ struct VecOpsConfig { - icicleStreamHandle stream; /** Stream for asynchronous execution. */ - bool is_a_on_device; /** True if `a` is on the device, false if it is not. Default value: false. */ - bool is_b_on_device; /** True if `b` is on the device, false if it is not. Default value: false. OPTIONAL. */ - bool is_result_on_device; /** If true, the output is preserved on the device, otherwise on the host. Default value: - false. */ - bool is_async; /** Whether to run the vector operations asynchronously. - If set to `true`, the function will be non-blocking and synchronization - must be explicitly managed using `cudaStreamSynchronize` or `cudaDeviceSynchronize`. - If set to `false`, the function will block the current CPU thread. */ - int batch_size; /** Number of vectors (or operations) to process in a batch. - Each vector operation will be performed independently on each batch element. - Default value: 1. */ - bool - columns_batch; /** True if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). - If false, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array). - Default value: false. 
*/ + icicleStreamHandle stream; /** Stream for asynchronous execution. */ + bool is_a_on_device; /** True if `a` is on the device, false if it is not. Default value: false. */ + bool is_b_on_device; /** True if `b` is on the device, false if it is not. Default value: false. OPTIONAL. */ + bool is_result_on_device; /** If true, the output is preserved on the device, otherwise on the host. Default value: + false. */ + bool is_async; /** Whether to run the vector operations asynchronously. + If set to `true`, the function will be non-blocking and synchronization + must be explicitly managed using `cudaStreamSynchronize` or `cudaDeviceSynchronize`. + If set to `false`, the function will block the current CPU thread. */ + int batch_size; /** Number of vectors (or operations) to process in a batch. + Each vector operation will be performed independently on each batch element. + Default value: 1. */ + bool columns_batch; /** True if the batched vectors are stored as columns in a 2D array (i.e., the vectors are + strided in memory as columns of a matrix). If false, the batched vectors are stored + contiguously in memory (e.g., as rows or in a flat array). Default value: false. */ ConfigExtension* ext = nullptr; /** Backend-specific extension. */ }; @@ -93,7 +92,8 @@ namespace icicle { * @return eIcicleError Error code indicating success or failure. */ template - eIcicleError vector_accumulate(T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config); // use vector_add (inplace) + eIcicleError + vector_accumulate(T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config); // use vector_add (inplace) /** * @brief Subtracts vector `b` from vector `a` element-wise. @@ -172,7 +172,8 @@ namespace icicle { * @return eIcicleError Error code indicating success or failure. 
*/ template - eIcicleError convert_montgomery(const T* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, T* output); + eIcicleError + convert_montgomery(const T* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, T* output); // Reduction operations @@ -195,25 +196,23 @@ namespace icicle { eIcicleError vector_sum(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); /** - * @brief Computes the product of all elements in each vector in the batch. - * - * @tparam T Type of the elements in the vectors. - * @param vec_a Pointer to the input vector(s). - * - If `config.batch_size > 1`, this should be a concatenated array of vectors. - * - The layout depends on `config.columns_batch`: - * - If `false`, vectors are stored contiguously. - * - If `true`, vectors are stored as columns in a 2D array. - * @param size Number of elements in each vector. - * @param config Configuration for the operation. - * @param output Pointer to the output array where the results will be stored. - * @return eIcicleError Error code indicating success or failure. - */ + * @brief Computes the product of all elements in each vector in the batch. + * + * @tparam T Type of the elements in the vectors. + * @param vec_a Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in each vector. + * @param config Configuration for the operation. + * @param output Pointer to the output array where the results will be stored. + * @return eIcicleError Error code indicating success or failure. 
+ */ template eIcicleError vector_product(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); - - // Scalar-Vector operations /** @@ -222,21 +221,24 @@ namespace icicle { * @tparam T Type of the elements in the vector and the scalar. * @param scalar_a Pointer to the input scalar(s). * - If `use_single_scalar` is `true`, this should point to a single scalar value. - * - If `use_single_scalar` is `false`, this should point to an array of scalars with length `config.batch_size`. - * @param vec_b Pointer to the input vector(s). + * - If `use_single_scalar` is `false`, this should point to an array of scalars with length + * `config.batch_size`. + * @param vec_b Pointer to the input vector(s). * - If `config.batch_size > 1`, this should be a concatenated array of vectors. * - The layout depends on `config.columns_batch`: * - If `false`, vectors are stored contiguously. * - If `true`, vectors are stored as columns in a 2D array. * @param size Number of elements in a vector. - * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of scalars (`false`). + * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of + * scalars (`false`). * @param config Configuration for the operation. * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. - * @note To subtract a scalar from each element of a vector - use scalar_add_vec with negative scalar. + * @note To subtract a scalar from each element of a vector - use scalar_add_vec with negative scalar. 
*/ template - eIcicleError scalar_add_vec(const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); + eIcicleError scalar_add_vec( + const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); /** * @brief Subtracts each element of a vector from a scalar, elementwise (res[i]=scalar-vec[i]). @@ -244,21 +246,24 @@ namespace icicle { * @tparam T Type of the elements in the vector and the scalar. * @param scalar_a Pointer to Input scalar(s). * - If `use_single_scalar` is `true`, this should point to a single scalar value. - * - If `use_single_scalar` is `false`, this should point to an array of scalars with length `config.batch_size`. - * @param vec_b Pointer to the input vector(s). + * - If `use_single_scalar` is `false`, this should point to an array of scalars with length + * `config.batch_size`. + * @param vec_b Pointer to the input vector(s). * - If `config.batch_size > 1`, this should be a concatenated array of vectors. * - The layout depends on `config.columns_batch`: * - If `false`, vectors are stored contiguously. * - If `true`, vectors are stored as columns in a 2D array. * @param size Number of elements in a vector. - * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of scalars (`false`). + * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of + * scalars (`false`). * @param config Configuration for the operation. * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. * @note To subtract a scalar from each element of a vector - use scalar_add_vec with negative scalar. 
*/ template - eIcicleError scalar_sub_vec(const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); + eIcicleError scalar_sub_vec( + const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); /** * @brief Multiplies each element of a vector by a scalar. @@ -266,20 +271,23 @@ namespace icicle { * @tparam T Type of the elements in the vector and the scalar. * @param scalar_a Pointer to Input scalar(s). * - If `use_single_scalar` is `true`, this should point to a single scalar value. - * - If `use_single_scalar` is `false`, this should point to an array of scalars with length `config.batch_size`. - * @param vec_b Pointer to the input vector(s). + * - If `use_single_scalar` is `false`, this should point to an array of scalars with length + * `config.batch_size`. + * @param vec_b Pointer to the input vector(s). * - If `config.batch_size > 1`, this should be a concatenated array of vectors. * - The layout depends on `config.columns_batch`: * - If `false`, vectors are stored contiguously. * - If `true`, vectors are stored as columns in a 2D array. * @param size Number of elements in a vector. - * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of scalars (`false`). + * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of + * scalars (`false`). * @param config Configuration for the operation. * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. 
*/ template - eIcicleError scalar_mul_vec(const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); + eIcicleError scalar_mul_vec( + const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); // Matrix operations @@ -294,14 +302,13 @@ namespace icicle { * @param mat_out Pointer to the output matrix or matrices where the transposed matrices will be stored. * @return eIcicleError Error code indicating success or failure. * @note The input matrices are assumed to be stored in row-major order. - * This function transposes an input matrix or a batch of matrices. + * This function transposes an input matrix or a batch of matrices. * Matrix transpose inplace is not supported for non-power of 2 rows and columns. */ template eIcicleError matrix_transpose(const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out); - - + // Miscellaneous operations /** @@ -309,10 +316,10 @@ namespace icicle { * * @tparam T Type of the elements in the vector. * @param vec_in Pointer to the input vector(s). - * - If `config.batch_size > 1`, this should be a concatenated array of vectors. - * - The layout depends on `config.columns_batch`: - * - If `false`, vectors are stored contiguously. - * - If `true`, vectors are stored as columns in a 2D array. + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. * @param size Number of elements in each vector. Must be a power of 2. * @param config Configuration for the operation. * @param vec_out Pointer to the output vector(s) where the results will be stored. @@ -332,7 +339,7 @@ namespace icicle { * @param stride Stride between elements in the slice. * @param size_in Number of elements in one input vector. 
* @param size_out Number of elements in one input vector. - * @param config Configuration for the operation. + * @param config Configuration for the operation. * @param vec_out Pointer to the output vector(s) where the results will be stored. * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. @@ -341,8 +348,14 @@ namespace icicle { * parameters must satisfy: offset + (size_out-1) * stride < size_in */ template - eIcicleError - slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size_in, uint64_t size_out, const VecOpsConfig& config, T* vec_out); + eIcicleError slice( + const T* vec_in, + uint64_t offset, + uint64_t stride, + uint64_t size_in, + uint64_t size_out, + const VecOpsConfig& config, + T* vec_out); /** * @brief Finds the highest non-zero index in a vector or batch of vectors. @@ -351,8 +364,8 @@ namespace icicle { * @param vec_in Pointer to the input vector(s). * @param size Number of elements in each input vector. * @param config Configuration for the operation. - * @param out_idx Pointer to an array where the output indices of the highest non-zero element in each input vector will be stored. - * The array should have a length of `config.batch_size`. + * @param out_idx Pointer to an array where the output indices of the highest non-zero element in each input vector + * will be stored. The array should have a length of `config.batch_size`. * @return eIcicleError Error code indicating success or failure. */ template @@ -364,18 +377,19 @@ namespace icicle { * @tparam T Type of the elements in the polynomial and domain. * @param coeffs Pointer to the array of coefficients of the polynomial(s). * - The size of `coeffs` should be `coeffs_size * batch_size`. - * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored contiguously. 
+ * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored + * contiguously. * - If `config.columns_batch` is `true`, coefficients are interleaved. * @param coeffs_size Number of coefficients in each polynomial. * @param domain Pointer to the array of points at which to evaluate the polynomial(s). - * - The same domain is used for all polynomials. - * - The size of `domain` should be `domain_size`. + * - The same domain is used for all polynomials. + * - The size of `domain` should be `domain_size`. * @param domain_size Number of domain points. * @param config Configuration for the operation. * @param evals Pointer to the array where the evaluated results will be stored. This is an output parameter. - * - The size of `evals` should be `domain_size * batch_size`. - * - If `config.columns_batch` is `false`, results for each polynomial are stored contiguously. - * - If `config.columns_batch` is `true`, results are interleaved. + * - The size of `evals` should be `domain_size * batch_size`. + * - If `config.columns_batch` is `false`, results for each polynomial are stored contiguously. + * - If `config.columns_batch` is `true`, results are interleaved. * @return eIcicleError Error code indicating success or failure. */ template @@ -393,7 +407,8 @@ namespace icicle { * @tparam T Type of the elements in the polynomials. * @param numerator Pointer to the array of coefficients of the numerator polynomial(s). * - The size of `numerator` should be `(numerator_deg + 1) * batch_size`. - * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored contiguously. + * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored + * contiguously. * - If `config.columns_batch` is `true`, coefficients are interleaved. * @param numerator_deg Degree of the numerator polynomial. * @param denominator Pointer to the array of coefficients of the denominator polynomial(s). 
@@ -410,8 +425,8 @@ namespace icicle { * @return eIcicleError Error code indicating success or failure. * * @note The degrees should satisfy `numerator_deg >= denominator_deg`. - * The sizes `q_size` and `r_size` must be at least `numerator_deg - denominator_deg + 1` and `denominator_deg`, respectively. - * The function assumes that the input and output arrays are properly allocated. + * The sizes `q_size` and `r_size` must be at least `numerator_deg - denominator_deg + 1` and `denominator_deg`, + * respectively. The function assumes that the input and output arrays are properly allocated. */ template eIcicleError polynomial_division( diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index 2c16ed389..c97fe3e1f 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -3,7 +3,6 @@ namespace icicle { - /*********************************** REDUCE PRODUCT ************************/ ICICLE_DISPATCHER_INST(VectorProductDispatcher, vector_product, VectorReduceOpImpl); @@ -14,24 +13,22 @@ namespace icicle { } template <> - eIcicleError - vector_product(const scalar_t* vec_a, uint64_t size, const VecOpsConfig& config, scalar_t* output) + eIcicleError vector_product(const scalar_t* vec_a, uint64_t size, const VecOpsConfig& config, scalar_t* output) { return CONCAT_EXPAND(FIELD, vector_product)(vec_a, size, &config, output); } /*********************************** REDUCE SUM ****************************/ - ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, VectorReduceOpImpl ); + ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, VectorReduceOpImpl); - extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_sum)( - const scalar_t* vec_a, uint64_t size, const VecOpsConfig* config, scalar_t* output) + extern "C" eIcicleError + CONCAT_EXPAND(FIELD, vector_sum)(const scalar_t* vec_a, uint64_t size, const VecOpsConfig* config, scalar_t* output) { return VectorSumDispatcher::execute(vec_a, size, *config, output); } template <> - eIcicleError - 
vector_sum(const scalar_t* vec_a, uint64_t size, const VecOpsConfig& config, scalar_t* output) + eIcicleError vector_sum(const scalar_t* vec_a, uint64_t size, const VecOpsConfig& config, scalar_t* output) { return CONCAT_EXPAND(FIELD, vector_sum)(vec_a, size, &config, output); } @@ -94,7 +91,8 @@ namespace icicle { } template <> - eIcicleError vector_accumulate(extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config) + eIcicleError + vector_accumulate(extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config) { return CONCAT_EXPAND(FIELD, extension_vector_accumulate)(vec_a, vec_b, size, &config); } @@ -186,14 +184,24 @@ namespace icicle { ICICLE_DISPATCHER_INST(ScalarAddDispatcher, scalar_add_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_add_vec)( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig* config, scalar_t* output) + const scalar_t* scalar_a, + const scalar_t* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig* config, + scalar_t* output) { return ScalarAddDispatcher::execute(scalar_a, vec_b, size, use_single_scalar, *config, output); } template <> eIcicleError scalar_add_vec( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, scalar_t* output) + const scalar_t* scalar_a, + const scalar_t* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig& config, + scalar_t* output) { return CONCAT_EXPAND(FIELD, scalar_add_vec)(scalar_a, vec_b, size, use_single_scalar, &config, output); } @@ -202,14 +210,24 @@ namespace icicle { ICICLE_DISPATCHER_INST(ScalarSubDispatcher, scalar_sub_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_sub_vec)( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig* config, scalar_t* output) + const 
scalar_t* scalar_a, + const scalar_t* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig* config, + scalar_t* output) { return ScalarSubDispatcher::execute(scalar_a, vec_b, size, use_single_scalar, *config, output); } template <> eIcicleError scalar_sub_vec( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, scalar_t* output) + const scalar_t* scalar_a, + const scalar_t* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig& config, + scalar_t* output) { return CONCAT_EXPAND(FIELD, scalar_sub_vec)(scalar_a, vec_b, size, use_single_scalar, &config, output); } @@ -217,14 +235,24 @@ namespace icicle { ICICLE_DISPATCHER_INST(ScalarMulDispatcher, scalar_mul_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_mul_vec)( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig* config, scalar_t* output) + const scalar_t* scalar_a, + const scalar_t* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig* config, + scalar_t* output) { return ScalarMulDispatcher::execute(scalar_a, vec_b, size, use_single_scalar, *config, output); } template <> eIcicleError scalar_mul_vec( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, scalar_t* output) + const scalar_t* scalar_a, + const scalar_t* vec_b, + uint64_t size, + bool use_single_scalar, + const VecOpsConfig& config, + scalar_t* output) { return CONCAT_EXPAND(FIELD, scalar_mul_vec)(scalar_a, vec_b, size, use_single_scalar, &config, output); } @@ -240,8 +268,8 @@ namespace icicle { } template <> - eIcicleError - convert_montgomery(const scalar_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, scalar_t* output) + eIcicleError convert_montgomery( + const scalar_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, scalar_t* 
output) { return CONCAT_EXPAND(FIELD, scalar_convert_montgomery)(input, size, is_to_montgomery, &config, output); } @@ -431,4 +459,4 @@ namespace icicle { numerator, numerator_deg, denumerator, denumerator_deg, q_size, r_size, &config, q_out, r_out); } -} // sizeamespace icicle \ No newline at end of file +} // namespace icicle \ No newline at end of file diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 5aa9dd973..50d4b0d8f 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -6,7 +6,6 @@ #include #include // For system - #include "icicle/runtime.h" #include "icicle/vec_ops.h" #include "icicle/ntt.h" @@ -31,7 +30,6 @@ static inline std::string s_main_target; static inline std::string s_reference_target; static const bool s_is_cuda_registered = is_device_registered("CUDA"); - template class FieldApiTest : public ::testing::Test { @@ -89,7 +87,6 @@ TYPED_TEST(FieldApiTest, FieldSanityTest) ASSERT_EQ(a * scalar_t::from(2), a + a); } - TYPED_TEST(FieldApiTest, vectorVectorOps) { int seed = time(0); @@ -102,7 +99,7 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) auto in_a = std::make_unique(total_size); auto in_b = std::make_unique(total_size); auto out_main = std::make_unique(total_size); - auto out_ref = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); auto vector_accumulate_wrapper = [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { @@ -127,74 +124,83 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) END_TIMER(VECADD_sync, oss.str().c_str(), measure); }; - // warmup // run(s_reference_target, out_ref.get(), false /*=measure*/, 16 /*=iters*/); // run(s_main_target, out_main.get(), false /*=measure*/, 1 /*=iters*/); - + // warmup // run(s_reference_target, out_ref.get(), false /*=measure*/, 16 /*=iters*/); // run(s_main_target, out_main.get(), false /*=measure*/, 1 /*=iters*/); - + // Element-wise vector operations - // If 
config.batch_size>1, (columns_batch=true or false) the operation is done element-wise anyway, so it doesn't affect the test + // If config.batch_size>1, (columns_batch=true or false) the operation is done element-wise anyway, so it doesn't + // affect the test // // add - FieldApiTest::random_samples(in_a.get(),total_size); - FieldApiTest::random_samples(in_b.get(),total_size); + FieldApiTest::random_samples(in_a.get(), total_size); + FieldApiTest::random_samples(in_b.get(), total_size); if (!s_is_cuda_registered) { - for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] + in_b[i]; } + for (int i = 0; i < total_size; i++) { + out_ref[i] = in_a[i] + in_b[i]; + } } else { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); - + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + // // accumulate - FieldApiTest::random_samples(in_a.get(),total_size); - FieldApiTest::random_samples(in_b.get(),total_size); + FieldApiTest::random_samples(in_a.get(), total_size); + FieldApiTest::random_samples(in_b.get(), total_size); if (!s_is_cuda_registered) { - for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] + in_b[i]; } + for (int i = 0; i < total_size; i++) { + out_ref[i] = in_a[i] + in_b[i]; + } } else { run(s_reference_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); } run(s_main_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); - ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), total_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), total_size * sizeof(TypeParam))); // // sub - FieldApiTest::random_samples(in_a.get(),total_size); - FieldApiTest::random_samples(in_b.get(),total_size); + 
FieldApiTest::random_samples(in_a.get(), total_size); + FieldApiTest::random_samples(in_b.get(), total_size); if (!s_is_cuda_registered) { - for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] - in_b[i]; } + for (int i = 0; i < total_size; i++) { + out_ref[i] = in_a[i] - in_b[i]; + } } else { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sub, "vector sub", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sub, "vector sub", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); // // mul - FieldApiTest::random_samples(in_a.get(),total_size); - FieldApiTest::random_samples(in_b.get(),total_size); + FieldApiTest::random_samples(in_a.get(), total_size); + FieldApiTest::random_samples(in_b.get(), total_size); if (!s_is_cuda_registered) { - for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] * in_b[i]; } + for (int i = 0; i < total_size; i++) { + out_ref[i] = in_a[i] * in_b[i]; + } } else { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_mul, "vector mul", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_mul, "vector mul", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); // // div - FieldApiTest::random_samples(in_a.get(),total_size); - FieldApiTest::random_samples(in_b.get(),total_size); + FieldApiTest::random_samples(in_a.get(), total_size); + FieldApiTest::random_samples(in_b.get(), total_size); // reference if (!s_is_cuda_registered) { - for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] * TypeParam::inverse(in_b[i]); } + for (int i = 0; i < total_size; i++) { + out_ref[i] = in_a[i] * TypeParam::inverse(in_b[i]); + } } else { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_div, 
"vector div", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_div, "vector div", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); - + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); } TYPED_TEST(FieldApiTest, montgomeryConversion) @@ -209,7 +215,7 @@ TYPED_TEST(FieldApiTest, montgomeryConversion) const int total_size = N * batch_size; auto in_a = std::make_unique(total_size); auto out_main = std::make_unique(total_size); - auto out_ref = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { Device dev = {dev_type, 0}; @@ -229,22 +235,29 @@ TYPED_TEST(FieldApiTest, montgomeryConversion) }; // Element-wise operation - // If config.batch_size>1, (columns_batch=true or false) the addition is done element-wise anyway, so it doesn't affect the test + // If config.batch_size>1, (columns_batch=true or false) the addition is done element-wise anyway, so it doesn't + // affect the test // convert_montgomery - FieldApiTest::random_samples(in_a.get(),total_size); + FieldApiTest::random_samples(in_a.get(), total_size); // reference if (!s_is_cuda_registered) { - if (is_to_montgomery) { for (int i = 0; i < total_size; i++) { out_ref[i] = TypeParam::to_montgomery(in_a[i]); } } - else { for (int i = 0; i < total_size; i++) { out_ref[i] = TypeParam::from_montgomery(in_a[i]); } } + if (is_to_montgomery) { + for (int i = 0; i < total_size; i++) { + out_ref[i] = TypeParam::to_montgomery(in_a[i]); + } + } else { + for (int i = 0; i < total_size; i++) { + out_ref[i] = TypeParam::from_montgomery(in_a[i]); + } + } } else { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "montgomery", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "montgomery", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * 
sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); } - TYPED_TEST(FieldApiTest, VectorReduceOps) { int seed = time(0); @@ -256,7 +269,7 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) const int total_size = N * batch_size; auto in_a = std::make_unique(total_size); auto out_main = std::make_unique(batch_size); - auto out_ref = std::make_unique(batch_size); + auto out_ref = std::make_unique(batch_size); auto vector_accumulate_wrapper = [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { @@ -280,44 +293,43 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) } END_TIMER(VECADD_sync, oss.str().c_str(), measure); }; - + // // sum - FieldApiTest::random_samples(in_a.get(),total_size); + FieldApiTest::random_samples(in_a.get(), total_size); // reference for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { - out_ref[idx_in_batch] = TypeParam::from(0); + out_ref[idx_in_batch] = TypeParam::from(0); } if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { - uint64_t idx_a = columns_batch ? idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; - out_ref[idx_in_batch] = out_ref[idx_in_batch] + in_a[idx_a]; + uint64_t idx_a = columns_batch ? 
idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_in_batch] = out_ref[idx_in_batch] + in_a[idx_a]; } } } else { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(TypeParam))); - // // product - FieldApiTest::random_samples(in_a.get(),total_size); + FieldApiTest::random_samples(in_a.get(), total_size); if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { out_ref[idx_in_batch] = TypeParam::from(1); } for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { - uint64_t idx_a = columns_batch ? idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; - out_ref[idx_in_batch] = out_ref[idx_in_batch]*in_a[idx_a]; + uint64_t idx_a = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_in_batch] = out_ref[idx_in_batch] * in_a[idx_a]; } } } else { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(TypeParam))); } TYPED_TEST(FieldApiTest, scalarVectorOps) @@ -330,10 +342,10 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) const bool columns_batch = rand() % 2; const bool use_single_scalar = rand() % 2; const int total_size = N * batch_size; - auto scalar_a = std::make_unique(use_single_scalar? 1 : batch_size); + auto scalar_a = std::make_unique(use_single_scalar ? 
1 : batch_size); auto in_b = std::make_unique(total_size); auto out_main = std::make_unique(total_size); - auto out_ref = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); auto vector_accumulate_wrapper = [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { @@ -357,35 +369,34 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) } END_TIMER(VECADD_sync, oss.str().c_str(), measure); }; - + // // scalar add vec - FieldApiTest::random_samples(scalar_a.get(),(use_single_scalar? 1: batch_size)); - FieldApiTest::random_samples(in_b.get(),total_size); - + FieldApiTest::random_samples(scalar_a.get(), (use_single_scalar ? 1 : batch_size)); + FieldApiTest::random_samples(in_b.get(), total_size); + // reference if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { - uint64_t idx_b = columns_batch ? idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; - out_ref[idx_b] = (use_single_scalar? scalar_a[0] : scalar_a[idx_in_batch]) + in_b[idx_b]; + uint64_t idx_b = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_b] = (use_single_scalar ? scalar_a[0] : scalar_a[idx_in_batch]) + in_b[idx_b]; } } } else { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); - // // scalar sub vec - FieldApiTest::random_samples(scalar_a.get(),(use_single_scalar? 1: batch_size)); - FieldApiTest::random_samples(in_b.get(),total_size); - + FieldApiTest::random_samples(scalar_a.get(), (use_single_scalar ? 
1 : batch_size)); + FieldApiTest::random_samples(in_b.get(), total_size); + if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { - uint64_t idx_b = columns_batch ? idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; - out_ref[idx_b] = (use_single_scalar? scalar_a[0] : scalar_a[idx_in_batch]) - in_b[idx_b]; + uint64_t idx_b = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_b] = (use_single_scalar ? scalar_a[0] : scalar_a[idx_in_batch]) - in_b[idx_b]; } } } else { @@ -393,24 +404,24 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); // // scalar mul vec - FieldApiTest::random_samples(scalar_a.get(),(use_single_scalar? 1: batch_size)); - FieldApiTest::random_samples(in_b.get(),total_size); - + FieldApiTest::random_samples(scalar_a.get(), (use_single_scalar ? 1 : batch_size)); + FieldApiTest::random_samples(in_b.get(), total_size); + if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { - uint64_t idx_b = columns_batch ? idx_in_N*batch_size + idx_in_batch : idx_in_batch*N + idx_in_N; - out_ref[idx_b] = (use_single_scalar? scalar_a[0] : scalar_a[idx_in_batch]) * in_b[idx_b]; + uint64_t idx_b = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_b] = (use_single_scalar ? 
scalar_a[0] : scalar_a[idx_in_batch]) * in_b[idx_b]; } } } else { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); } TYPED_TEST(FieldApiTest, matrixAPIsAsync) @@ -418,12 +429,15 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) int seed = time(0); srand(seed); // ICICLE_LOG_DEBUG << "seed = " << seed; - const int R = 1 << (rand() % 8 + 2); // cpu implementation for out of place trancpose also supports sizes wich are not powers of 2 - const int C = 1 << (rand() % 8 + 2); // cpu implementation for out of place trancpose also supports sizes wich are not powers of 2 + const int R = + 1 << (rand() % 8 + 2); // cpu implementation for out of place trancpose also supports sizes wich are not powers of 2 + const int C = + 1 << (rand() % 8 + 2); // cpu implementation for out of place trancpose also supports sizes wich are not powers of 2 const int batch_size = 1 << (rand() % 4); const bool columns_batch = rand() % 2; const bool is_in_place = rand() % 2; - // ICICLE_LOG_DEBUG << "R = " << R << ", C = " << C << ", batch_size = " << batch_size << ", columns_batch = " << columns_batch << ", is_in_place = " << is_in_place; //TODO SHANIE - remove this + // ICICLE_LOG_DEBUG << "R = " << R << ", C = " << C << ", batch_size = " << batch_size << ", columns_batch = " << + // columns_batch << ", is_in_place = " << is_in_place; //TODO SHANIE - remove this const int total_size = R * C * batch_size; auto h_inout = std::make_unique(total_size); auto h_out_main = std::make_unique(total_size); @@ -490,36 +504,37 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) // } // Option 3: Initialize the entire input array with random values - 
FieldApiTest::random_samples(h_inout.get(),total_size); + FieldApiTest::random_samples(h_inout.get(), total_size); // Reference implementation if (!s_is_cuda_registered) { const TypeParam* cur_mat_in = h_inout.get(); TypeParam* cur_mat_out = h_out_ref.get(); - uint32_t stride = columns_batch? batch_size : 1; + uint32_t stride = columns_batch ? batch_size : 1; const uint64_t total_elements_one_mat = static_cast(R) * C; for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { // Perform the matrix transpose for (uint32_t i = 0; i < R; ++i) { for (uint32_t j = 0; j < C; ++j) { - cur_mat_out[stride*(j * R + i)] = cur_mat_in[stride*(i * C + j)]; + cur_mat_out[stride * (j * R + i)] = cur_mat_in[stride * (i * C + j)]; } } cur_mat_in += (columns_batch ? 1 : total_elements_one_mat); cur_mat_out += (columns_batch ? 1 : total_elements_one_mat); } } else { - run(s_reference_target, (is_in_place? h_inout.get() : h_out_ref.get()), VERBOSE /*=measure*/, "transpose", ITERS); + run(s_reference_target, (is_in_place ? h_inout.get() : h_out_ref.get()), VERBOSE /*=measure*/, "transpose", ITERS); } - run(s_main_target, (is_in_place? h_inout.get() : h_out_main.get()), VERBOSE /*=measure*/, "transpose", ITERS); + run(s_main_target, (is_in_place ? 
h_inout.get() : h_out_main.get()), VERBOSE /*=measure*/, "transpose", ITERS); if (is_in_place) { ASSERT_EQ(0, memcmp(h_inout.get(), h_out_ref.get(), total_size * sizeof(TypeParam))); } else { - // std::cout << "h_out_main:\t["; for (int i = 0; i < total_size-1; i++) { std::cout << h_out_main[i] << ", "; } std::cout <::random_samples(in_a.get(),total_size); - + FieldApiTest::random_samples(in_a.get(), total_size); // Reference implementation if (!s_is_cuda_registered) { uint64_t logn = 0; uint64_t temp = N; while (temp > 1) { - temp >>= 1; - logn++; + temp >>= 1; + logn++; } - //BIT REVERSE FUNCTION + // BIT REVERSE FUNCTION for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t i = 0; i < N; i++) { int rev = 0; for (int j = 0; j < logn; ++j) { if (i & (1 << j)) { rev |= 1 << (logn - 1 - j); } } - if(columns_batch){ + if (columns_batch) { out_ref[idx_in_batch + batch_size * i] = in_a[idx_in_batch + batch_size * rev]; - // ICICLE_LOG_DEBUG << "out_ref[" << idx_in_batch + batch_size * i << "] = in_a[" << idx_in_batch + batch_size * rev << "]"; + // ICICLE_LOG_DEBUG << "out_ref[" << idx_in_batch + batch_size * i << "] = in_a[" << idx_in_batch + batch_size + // * rev << "]"; } else { out_ref[idx_in_batch * N + i] = in_a[idx_in_batch * N + rev]; // ICICLE_LOG_DEBUG << "out_ref[" << idx_in_batch * N + i << "] = in_a[" << idx_in_batch * N + rev << "]"; @@ -607,17 +621,17 @@ TYPED_TEST(FieldApiTest, bitReverse) } } } else { - run(s_reference_target, (is_in_place? in_a.get() : out_ref.get()), VERBOSE /*=measure*/, "bit-reverse", 1); + run(s_reference_target, (is_in_place ? in_a.get() : out_ref.get()), VERBOSE /*=measure*/, "bit-reverse", 1); } - run(s_main_target, (is_in_place? in_a.get() : out_main.get()), VERBOSE /*=measure*/, "bit-reverse", 1); + run(s_main_target, (is_in_place ? 
in_a.get() : out_main.get()), VERBOSE /*=measure*/, "bit-reverse", 1); if (is_in_place) { ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), N * sizeof(TypeParam))); } else { - // std::cout << "out_main:\t["; for (int i = 0; i < total_size-1; i++) { std::cout << out_main[i] << ", "; } std::cout <::random_samples(in_coeffs.get(), total_coeffs_size); FieldApiTest::random_samples(in_domain.get(), domain_size); - // Reference implementation - // TODO - Check in comperison with GPU implementation + // TODO - Check in comperison with GPU implementation run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); if (s_is_cuda_registered) { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); - // std::cout << "out_main:\t["; for (int i = 0; i < total_coeffs_size-1; i++) { std::cout << out_main[i] << ", "; } std::cout <(total_q_size); auto r_out_ref = std::make_unique(total_r_size); - auto run = [&](const std::string& dev_type, TypeParam* q_out, TypeParam* r_out, bool measure, const char* msg, int iters) { - Device dev = {dev_type, 0}; - icicle_set_device(dev); - auto config = default_vec_ops_config(); - config.batch_size = batch_size; - config.columns_batch = columns_batch; - + auto run = + [&](const std::string& dev_type, TypeParam* q_out, TypeParam* r_out, bool measure, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; - std::ostringstream oss; - oss << dev_type << " " << msg; + std::ostringstream oss; + oss << dev_type << " " << msg; - START_TIMER(polynomialDivision) - for (int i = 0; i < iters; ++i) { - ICICLE_CHECK(polynomial_division(numerator.get(), numerator_deg, denumerator.get(), denumerator_deg , q_size, r_size, config, q_out, r_out)); - } - END_TIMER(polynomialDivision, oss.str().c_str(), measure); - }; + START_TIMER(polynomialDivision) + for (int i = 0; i < iters; 
++i) { + ICICLE_CHECK(polynomial_division( + numerator.get(), numerator_deg, denumerator.get(), denumerator_deg, q_size, r_size, config, q_out, r_out)); + } + END_TIMER(polynomialDivision, oss.str().c_str(), measure); + }; // // Option 1: Initialize input vectors with random values // FieldApiTest::random_samples(numerator.get(), total_numerator_size); // FieldApiTest::random_samples(denumerator.get(), total_denumerator_size); - // // Reference implementation + // // Reference implementation // TODO - Check in comperison with GPU implementation or implement a general reference implementation // Option 2: Initialize the numerator and denumerator with chosen example // And the reference implementation for the example for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { - if (columns_batch){ + if (columns_batch) { // numerator = 3x^3+4x^2+5 - numerator[idx_in_batch + 0*batch_size] = TypeParam::from(5); - numerator[idx_in_batch + 1*batch_size] = TypeParam::from(0); - numerator[idx_in_batch + 2*batch_size] = TypeParam::from(4); - numerator[idx_in_batch + 3*batch_size] = TypeParam::from(3); + numerator[idx_in_batch + 0 * batch_size] = TypeParam::from(5); + numerator[idx_in_batch + 1 * batch_size] = TypeParam::from(0); + numerator[idx_in_batch + 2 * batch_size] = TypeParam::from(4); + numerator[idx_in_batch + 3 * batch_size] = TypeParam::from(3); // denumerator = x^2-1 - denumerator[idx_in_batch + 0*batch_size] = TypeParam::from(0) - TypeParam::from(1); - denumerator[idx_in_batch + 1*batch_size] = TypeParam::from(0); - denumerator[idx_in_batch + 2*batch_size] = TypeParam::from(1); + denumerator[idx_in_batch + 0 * batch_size] = TypeParam::from(0) - TypeParam::from(1); + denumerator[idx_in_batch + 1 * batch_size] = TypeParam::from(0); + denumerator[idx_in_batch + 2 * batch_size] = TypeParam::from(1); if (!s_is_cuda_registered) { // q_out_ref = 3x+4 - q_out_ref[idx_in_batch + 0*batch_size] = TypeParam::from(4); - q_out_ref[idx_in_batch + 1*batch_size] 
= TypeParam::from(3); + q_out_ref[idx_in_batch + 0 * batch_size] = TypeParam::from(4); + q_out_ref[idx_in_batch + 1 * batch_size] = TypeParam::from(3); // r_out_ref = 3x+9 - r_out_ref[idx_in_batch + 0*batch_size] = TypeParam::from(9); - r_out_ref[idx_in_batch + 1*batch_size] = TypeParam::from(3); + r_out_ref[idx_in_batch + 0 * batch_size] = TypeParam::from(9); + r_out_ref[idx_in_batch + 1 * batch_size] = TypeParam::from(3); } } else { // numerator = 3x^3+4x^2+5 - numerator[idx_in_batch * (numerator_deg+1) + 0] = TypeParam::from(5); - numerator[idx_in_batch * (numerator_deg+1) + 1] = TypeParam::from(0); - numerator[idx_in_batch * (numerator_deg+1) + 2] = TypeParam::from(4); - numerator[idx_in_batch * (numerator_deg+1) + 3] = TypeParam::from(3); + numerator[idx_in_batch * (numerator_deg + 1) + 0] = TypeParam::from(5); + numerator[idx_in_batch * (numerator_deg + 1) + 1] = TypeParam::from(0); + numerator[idx_in_batch * (numerator_deg + 1) + 2] = TypeParam::from(4); + numerator[idx_in_batch * (numerator_deg + 1) + 3] = TypeParam::from(3); // denumerator = x^2-1 - denumerator[idx_in_batch * (denumerator_deg+1) + 0] = TypeParam::from(0) - TypeParam::from(1); - denumerator[idx_in_batch * (denumerator_deg+1) + 1] = TypeParam::from(0); - denumerator[idx_in_batch * (denumerator_deg+1) + 2] = TypeParam::from(1); + denumerator[idx_in_batch * (denumerator_deg + 1) + 0] = TypeParam::from(0) - TypeParam::from(1); + denumerator[idx_in_batch * (denumerator_deg + 1) + 1] = TypeParam::from(0); + denumerator[idx_in_batch * (denumerator_deg + 1) + 2] = TypeParam::from(1); if (!s_is_cuda_registered) { // q_out_ref = 3x+4 q_out_ref[idx_in_batch * q_size + 0] = TypeParam::from(4); @@ -903,10 +918,13 @@ TYPED_TEST(FieldApiTest, polynomialDivision) if (s_is_cuda_registered) { run(s_reference_target, q_out_ref.get(), r_out_ref.get(), VERBOSE /*=measure*/, "polynomial_division", 1); } - // std::cout << "numerator:\t["; for (int i = 0; i < total_numerator_size-1; i++) { std::cout << 
numerator[i] << ", "; } std::cout < Date: Mon, 21 Oct 2024 14:33:56 +0300 Subject: [PATCH 11/43] vectorVectorOps passes --- icicle/tests/test_field_api.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 50d4b0d8f..a717faf33 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -28,7 +28,8 @@ static bool VERBOSE = true; static int ITERS = 1; static inline std::string s_main_target; static inline std::string s_reference_target; -static const bool s_is_cuda_registered = is_device_registered("CUDA"); +// static const bool s_is_cuda_registered = is_device_registered("CUDA"); +bool s_is_cuda_registered; template class FieldApiTest : public ::testing::Test @@ -42,6 +43,7 @@ class FieldApiTest : public ::testing::Test #endif icicle_load_backend_from_env_or_default(); + s_is_cuda_registered = is_device_registered("CUDA"); if (!s_is_cuda_registered) { ICICLE_LOG_ERROR << "CUDA device not found. Testing CPU vs reference (on cpu)"; } s_main_target = s_is_cuda_registered ? 
"CUDA" : "CPU"; s_reference_target = "CPU"; @@ -93,13 +95,18 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) srand(seed); ICICLE_LOG_DEBUG << "seed = " << seed; const uint64_t N = 1 << (rand() % 15 + 3); + // const uint64_t N = 1 << (3); const int batch_size = 1 << (rand() % 5); + // const int batch_size = 2; const bool columns_batch = rand() % 2; const int total_size = N * batch_size; auto in_a = std::make_unique(total_size); auto in_b = std::make_unique(total_size); auto out_main = std::make_unique(total_size); auto out_ref = std::make_unique(total_size); + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; auto vector_accumulate_wrapper = [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { @@ -152,14 +159,19 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) // // accumulate FieldApiTest::random_samples(in_a.get(), total_size); FieldApiTest::random_samples(in_b.get(), total_size); - if (!s_is_cuda_registered) { + // if (!s_is_cuda_registered) { for (int i = 0; i < total_size; i++) { out_ref[i] = in_a[i] + in_b[i]; } - } else { - run(s_reference_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); - } + // } else { + // run(s_reference_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); + // } run(s_main_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); + + // for (int i = 0; i < total_size; i++) { + // ICICLE_LOG_DEBUG << i << ", " << in_a[i] << ", " << in_b[i] << ", " << out_ref[i]; + // } + ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), total_size * sizeof(TypeParam))); // // sub From 0c6bc9aaffde999272f2306562bfafe8e2f0bef3 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Tue, 22 Oct 2024 15:14:43 +0300 Subject: [PATCH 12/43] mont + scalars passing --- icicle/tests/test_field_api.cpp | 19 
+++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index a717faf33..8f607e4d4 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -348,16 +348,24 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) { int seed = time(0); srand(seed); - // ICICLE_LOG_DEBUG << "seed = " << seed; + ICICLE_LOG_DEBUG << "seed = " << seed; const uint64_t N = 1 << (rand() % 15 + 3); + // const uint64_t N = 1 << 3; const int batch_size = 1 << (rand() % 5); + // const int batch_size = 2; const bool columns_batch = rand() % 2; + // const bool columns_batch = 0; const bool use_single_scalar = rand() % 2; + // const bool use_single_scalar = 1; const int total_size = N * batch_size; auto scalar_a = std::make_unique(use_single_scalar ? 1 : batch_size); auto in_b = std::make_unique(total_size); auto out_main = std::make_unique(total_size); auto out_ref = std::make_unique(total_size); + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + ICICLE_LOG_DEBUG << "use_single_scalar = " << use_single_scalar; auto vector_accumulate_wrapper = [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { @@ -398,9 +406,16 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); + + + // ICICLE_LOG_DEBUG << scalar_a[0] << ", "; + // for (int i = 0; i < total_size; i++) { + // ICICLE_LOG_DEBUG << i << ", " << in_b[i] << ", " << out_main[i] << ", " << out_ref[i]; + // } + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); - // // scalar sub vec + // scalar sub vec FieldApiTest::random_samples(scalar_a.get(), (use_single_scalar ? 
1 : batch_size)); FieldApiTest::random_samples(in_b.get(), total_size); From 32e262b2f34c9870692674b7fee0c8abc9c18984 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Wed, 23 Oct 2024 18:20:46 +0300 Subject: [PATCH 13/43] bitrev passes --- icicle/tests/test_field_api.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 8f607e4d4..96f09ed82 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -569,17 +569,17 @@ TYPED_TEST(FieldApiTest, bitReverse) { int seed = time(0); srand(seed); - // ICICLE_LOG_DEBUG << "seed = " << seed; + ICICLE_LOG_DEBUG << "seed = " << seed; const uint64_t N = 1 << (rand() % 15 + 3); const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; const bool is_in_place = rand() % 2; const int total_size = N * batch_size; - // const uint64_t N = 1 << (2); + // const uint64_t N = 1 << (3); // const int batch_size = 1 << (1); - // const bool columns_batch = true; - // const bool is_in_place = true; + // const bool columns_batch = 1; + // const bool is_in_place = 0; // const int total_size = N * batch_size; auto in_a = std::make_unique(total_size); @@ -623,7 +623,7 @@ TYPED_TEST(FieldApiTest, bitReverse) FieldApiTest::random_samples(in_a.get(), total_size); // Reference implementation - if (!s_is_cuda_registered) { + if (!s_is_cuda_registered || is_in_place) { uint64_t logn = 0; uint64_t temp = N; while (temp > 1) { @@ -652,6 +652,10 @@ TYPED_TEST(FieldApiTest, bitReverse) } run(s_main_target, (is_in_place ? 
in_a.get() : out_main.get()), VERBOSE /*=measure*/, "bit-reverse", 1); + // for (int i = 0; i < total_size; i++) { + // ICICLE_LOG_DEBUG << i << ", " << in_a[i] << ", " << out_main[i] << ", " << out_ref[i]; + // } + if (is_in_place) { ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), N * sizeof(TypeParam))); } else { From e8e1799f63f5f978a913e533a62ce11ecbc981c2 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Mon, 28 Oct 2024 13:20:43 +0200 Subject: [PATCH 14/43] slice passes --- icicle/tests/test_field_api.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 96f09ed82..bacb44540 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -670,13 +670,25 @@ TYPED_TEST(FieldApiTest, Slice) { int seed = time(0); srand(seed); - // ICICLE_LOG_DEBUG << "seed = " << seed; + ICICLE_LOG_DEBUG << "seed = " << seed; const uint64_t size_in = 1 << (rand() % 15 + 5); const uint64_t offset = rand() % 15; const uint64_t stride = rand() % 4 + 1; const uint64_t size_out = rand() % (((size_in - offset) / stride) - 1) + 1; const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; + + // const uint64_t size_in = 1 << (20); + // const uint64_t offset = 97; + // const uint64_t stride = 6; + // const uint64_t size_out = (((size_in - offset) / stride) - 1) - 100; + + // ICICLE_LOG_DEBUG << size_in <<", "<< offset<<", "< Date: Tue, 29 Oct 2024 12:24:07 +0200 Subject: [PATCH 15/43] slice passes --- icicle/tests/test_field_api.cpp | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index bacb44540..cc2927b7f 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -462,7 +462,14 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) 1 << (rand() % 8 + 2); // cpu implementation for out of place trancpose also 
supports sizes wich are not powers of 2 const int batch_size = 1 << (rand() % 4); const bool columns_batch = rand() % 2; - const bool is_in_place = rand() % 2; + const bool is_in_place = s_is_cuda_registered? 0 : rand() % 2; //TODO - fix inplace (Hadar: I'm not sure we should support it) + + // const int R = 4; // cpu implementation for out of place trancpose also supports sizes wich are not powers of 2 + // const int C = 3; + // const int batch_size = 1 << (1); + // const bool columns_batch = 1; + // const bool is_in_place = 1; + // ICICLE_LOG_DEBUG << "R = " << R << ", C = " << C << ", batch_size = " << batch_size << ", columns_batch = " << // columns_batch << ", is_in_place = " << is_in_place; //TODO SHANIE - remove this const int total_size = R * C * batch_size; @@ -488,9 +495,9 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) TypeParam *d_in, *d_out; if (!device_props.using_host_memory) { icicle_create_stream(&config.stream); - icicle_malloc_async((void**)&d_in, R * C * sizeof(TypeParam), config.stream); - icicle_malloc_async((void**)&d_out, R * C * sizeof(TypeParam), config.stream); - icicle_copy_to_device_async(d_in, h_inout.get(), R * C * sizeof(TypeParam), config.stream); + icicle_malloc_async((void**)&d_in, total_size * sizeof(TypeParam), config.stream); + icicle_malloc_async((void**)&d_out, total_size * sizeof(TypeParam), config.stream); + icicle_copy_to_device_async(d_in, h_inout.get(), total_size * sizeof(TypeParam), config.stream); config.is_a_on_device = true; config.is_result_on_device = true; @@ -507,7 +514,7 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) END_TIMER(TRANSPOSE, oss.str().c_str(), measure); if (!device_props.using_host_memory) { - icicle_copy_to_host_async(h_out, d_out, R * C * sizeof(TypeParam), config.stream); + icicle_copy_to_host_async(h_out, d_out, total_size * sizeof(TypeParam), config.stream); icicle_stream_synchronize(config.stream); icicle_free_async(d_in, config.stream); icicle_free_async(d_out, config.stream); @@ -554,6 +561,12 
@@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) } run(s_main_target, (is_in_place ? h_inout.get() : h_out_main.get()), VERBOSE /*=measure*/, "transpose", ITERS); + + // ICICLE_LOG_DEBUG << scalar_a[0] << ", "; + // for (int i = 0; i < total_size; i++) { + // ICICLE_LOG_DEBUG << i << ", " << h_inout[i] << ", " << h_out_main[i] << ", " << h_out_ref[i]; + // } + if (is_in_place) { ASSERT_EQ(0, memcmp(h_inout.get(), h_out_ref.get(), total_size * sizeof(TypeParam))); } else { From 0c609bf10e192d50e39637cbae165b22ae441072 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Tue, 29 Oct 2024 15:48:02 +0200 Subject: [PATCH 16/43] reduction passes --- icicle/tests/test_field_api.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index cc2927b7f..a1bffc7b8 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -279,6 +279,12 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; const int total_size = N * batch_size; + + // const uint64_t N = 1 << (rand() % 15 + 3); + // const int batch_size = 1 << 3; + // const bool columns_batch = 1; + // const int total_size = N * batch_size; + auto in_a = std::make_unique(total_size); auto out_main = std::make_unique(batch_size); auto out_ref = std::make_unique(batch_size); From dca2e5bf21b3e95a579f46a2acf58b730179409b Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Wed, 30 Oct 2024 10:28:18 +0200 Subject: [PATCH 17/43] fix scalar columns batch --- icicle/tests/test_field_api.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index a1bffc7b8..00985a25d 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -274,14 +274,14 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) { int seed = time(0); srand(seed); - // ICICLE_LOG_DEBUG << "seed 
= " << seed; + ICICLE_LOG_DEBUG << "seed = " << seed; const uint64_t N = 1 << (rand() % 15 + 3); const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; const int total_size = N * batch_size; - // const uint64_t N = 1 << (rand() % 15 + 3); - // const int batch_size = 1 << 3; + // const uint64_t N = 1 << (20); + // const int batch_size = 1 << 4; // const bool columns_batch = 1; // const int total_size = N * batch_size; @@ -356,13 +356,15 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) srand(seed); ICICLE_LOG_DEBUG << "seed = " << seed; const uint64_t N = 1 << (rand() % 15 + 3); - // const uint64_t N = 1 << 3; const int batch_size = 1 << (rand() % 5); - // const int batch_size = 2; const bool columns_batch = rand() % 2; - // const bool columns_batch = 0; const bool use_single_scalar = rand() % 2; - // const bool use_single_scalar = 1; + + // const uint64_t N = 1 << (4); + // const int batch_size = 7; + // const bool columns_batch = 1; + // const bool use_single_scalar = 0; + const int total_size = N * batch_size; auto scalar_a = std::make_unique(use_single_scalar ? 
1 : batch_size); auto in_b = std::make_unique(total_size); @@ -415,6 +417,7 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) // ICICLE_LOG_DEBUG << scalar_a[0] << ", "; + // ICICLE_LOG_DEBUG << scalar_a[1] << ", "; // for (int i = 0; i < total_size; i++) { // ICICLE_LOG_DEBUG << i << ", " << in_b[i] << ", " << out_main[i] << ", " << out_ref[i]; // } From 0728a069352f4bd8eca8d5a00bb6554a17dd54ba Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Wed, 30 Oct 2024 12:20:25 +0200 Subject: [PATCH 18/43] remove same scalar bool --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 20 ++++------ .../include/icicle/backend/vec_ops_backend.h | 1 - .../default_backend/default_poly_backend.h | 8 ++-- icicle/include/icicle/vec_ops.h | 21 ++-------- icicle/src/vec_ops.cpp | 18 +++------ icicle/tests/test_field_api.cpp | 39 ++++++++++--------- 6 files changed, 41 insertions(+), 66 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 74678fc83..7133bec8c 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -405,22 +405,21 @@ eIcicleError cpu_scalar_vector_op( const T* scalar_a, const T* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig& config, T* output) { TasksManager> task_manager(get_nof_workers(config) - 1); - const uint64_t total_nof_operations = use_single_scalar ? size * config.batch_size : size; - const uint32_t stride = (!use_single_scalar && config.columns_batch) ? config.batch_size : 1; - for (uint32_t idx_in_batch = 0; idx_in_batch < (use_single_scalar ? 1 : config.batch_size); idx_in_batch++) { + const uint64_t total_nof_operations = size; + const uint32_t stride = config.columns_batch ? 
config.batch_size : 1; + for (uint32_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_2ops_task( op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), scalar_a + idx_in_batch, - (!use_single_scalar && config.columns_batch) ? vec_b + idx_in_batch + i * config.batch_size + config.columns_batch ? vec_b + idx_in_batch + i * config.batch_size : vec_b + idx_in_batch * size + i, stride, - (!use_single_scalar && config.columns_batch) ? output + idx_in_batch + i * config.batch_size + config.columns_batch ? output + idx_in_batch + i * config.batch_size : output + idx_in_batch * size + i); } } @@ -595,11 +594,10 @@ eIcicleError cpu_scalar_add( const T* scalar_a, const T* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig& config, T* output) { - return cpu_scalar_vector_op(VecOperation::SCALAR_ADD_VEC, scalar_a, vec_b, size, use_single_scalar, config, output); + return cpu_scalar_vector_op(VecOperation::SCALAR_ADD_VEC, scalar_a, vec_b, size, config, output); } REGISTER_SCALAR_ADD_VEC_BACKEND("CPU", cpu_scalar_add); @@ -611,11 +609,10 @@ eIcicleError cpu_scalar_sub( const T* scalar_a, const T* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig& config, T* output) { - return cpu_scalar_vector_op(VecOperation::SCALAR_SUB_VEC, scalar_a, vec_b, size, use_single_scalar, config, output); + return cpu_scalar_vector_op(VecOperation::SCALAR_SUB_VEC, scalar_a, vec_b, size, config, output); } REGISTER_SCALAR_SUB_VEC_BACKEND("CPU", cpu_scalar_sub); @@ -627,11 +624,10 @@ eIcicleError cpu_scalar_mul( const T* scalar_a, const T* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig& config, T* output) { - return cpu_scalar_vector_op(VecOperation::SCALAR_MUL_VEC, scalar_a, vec_b, size, use_single_scalar, config, output); + return 
cpu_scalar_vector_op(VecOperation::SCALAR_MUL_VEC, scalar_a, vec_b, size, config, output); } REGISTER_SCALAR_MUL_VEC_BACKEND("CPU", cpu_scalar_mul); diff --git a/icicle/include/icicle/backend/vec_ops_backend.h b/icicle/include/icicle/backend/vec_ops_backend.h index 58909e1f4..1adfe89f8 100644 --- a/icicle/include/icicle/backend/vec_ops_backend.h +++ b/icicle/include/icicle/backend/vec_ops_backend.h @@ -34,7 +34,6 @@ namespace icicle { const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig& config, scalar_t* output)>; diff --git a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h index c7e53b218..12468cb53 100644 --- a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h +++ b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h @@ -126,7 +126,7 @@ namespace icicle { C zero = C::zero(); config.is_a_on_device = false; ICICLE_CHECK( - scalar_sub_vec(&zero, b_mem_p + min_op_size, b->get_nof_elements() - min_op_size, true, config, res_mem_p)); + scalar_sub_vec(&zero, b_mem_p + min_op_size, b->get_nof_elements() - min_op_size, config, res_mem_p)); } } @@ -173,7 +173,7 @@ namespace icicle { config.is_result_on_device = true; config.is_async = true; config.stream = m_stream; - icicle::scalar_mul_vec(&scalar, p_elements_p, N, true, config, out_evals_p); + icicle::scalar_mul_vec(&scalar, p_elements_p, N, config, out_evals_p); } void multiply_with_padding(PolyContext c, PolyContext a, PolyContext b) @@ -409,7 +409,7 @@ namespace icicle { config.is_async = true; config.stream = m_stream; icicle::scalar_mul_vec( - &v_coset_eval, numerator_evals_reversed_p + N /*second half is the reversed coset*/, N, true, config, + &v_coset_eval, numerator_evals_reversed_p + N /*second half is the reversed coset*/, N, config, out_evals_reversed_p); // INTT back from reversed evals on coset to coeffs @@ 
-450,7 +450,7 @@ namespace icicle { config.is_result_on_device = true; config.is_async = true; config.stream = m_stream; - icicle::scalar_mul_vec(&v_coset_eval, out_evals_reversed_p, N, true, config, out_evals_reversed_p); + icicle::scalar_mul_vec(&v_coset_eval, out_evals_reversed_p, N, config, out_evals_reversed_p); // (3) INTT back from coset to coeffs ntt_config.are_inputs_on_device = true; diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index 524cbcdc5..132d6cb69 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -220,17 +220,12 @@ namespace icicle { * * @tparam T Type of the elements in the vector and the scalar. * @param scalar_a Pointer to the input scalar(s). - * - If `use_single_scalar` is `true`, this should point to a single scalar value. - * - If `use_single_scalar` is `false`, this should point to an array of scalars with length - * `config.batch_size`. * @param vec_b Pointer to the input vector(s). * - If `config.batch_size > 1`, this should be a concatenated array of vectors. * - The layout depends on `config.columns_batch`: * - If `false`, vectors are stored contiguously. * - If `true`, vectors are stored as columns in a 2D array. * @param size Number of elements in a vector. - * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of - * scalars (`false`). * @param config Configuration for the operation. * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. 
@@ -238,24 +233,19 @@ namespace icicle { */ template eIcicleError scalar_add_vec( - const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); + const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); /** * @brief Subtracts each element of a vector from a scalar, elementwise (res[i]=scalar-vec[i]). * * @tparam T Type of the elements in the vector and the scalar. * @param scalar_a Pointer to Input scalar(s). - * - If `use_single_scalar` is `true`, this should point to a single scalar value. - * - If `use_single_scalar` is `false`, this should point to an array of scalars with length - * `config.batch_size`. * @param vec_b Pointer to the input vector(s). * - If `config.batch_size > 1`, this should be a concatenated array of vectors. * - The layout depends on `config.columns_batch`: * - If `false`, vectors are stored contiguously. * - If `true`, vectors are stored as columns in a 2D array. * @param size Number of elements in a vector. - * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of - * scalars (`false`). * @param config Configuration for the operation. * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. @@ -263,31 +253,26 @@ namespace icicle { */ template eIcicleError scalar_sub_vec( - const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); + const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); /** * @brief Multiplies each element of a vector by a scalar. * * @tparam T Type of the elements in the vector and the scalar. * @param scalar_a Pointer to Input scalar(s). - * - If `use_single_scalar` is `true`, this should point to a single scalar value. 
- * - If `use_single_scalar` is `false`, this should point to an array of scalars with length - * `config.batch_size`. * @param vec_b Pointer to the input vector(s). * - If `config.batch_size > 1`, this should be a concatenated array of vectors. * - The layout depends on `config.columns_batch`: * - If `false`, vectors are stored contiguously. * - If `true`, vectors are stored as columns in a 2D array. * @param size Number of elements in a vector. - * @param use_single_scalar Flag indicating whether to use a single scalar for all vectors (`true`) or an array of - * scalars (`false`). * @param config Configuration for the operation. * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. */ template eIcicleError scalar_mul_vec( - const T* scalar_a, const T* vec_b, uint64_t size, bool use_single_scalar, const VecOpsConfig& config, T* output); + const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); // Matrix operations diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index c97fe3e1f..5eb4ea49e 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -187,11 +187,10 @@ namespace icicle { const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig* config, scalar_t* output) { - return ScalarAddDispatcher::execute(scalar_a, vec_b, size, use_single_scalar, *config, output); + return ScalarAddDispatcher::execute(scalar_a, vec_b, size, *config, output); } template <> @@ -199,11 +198,10 @@ namespace icicle { const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, scalar_add_vec)(scalar_a, vec_b, size, use_single_scalar, &config, output); + return CONCAT_EXPAND(FIELD, scalar_add_vec)(scalar_a, vec_b, size, &config, output); } /*********************************** (Scalar - 
Vector) ELEMENT WISE ***********************************/ @@ -213,11 +211,10 @@ namespace icicle { const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig* config, scalar_t* output) { - return ScalarSubDispatcher::execute(scalar_a, vec_b, size, use_single_scalar, *config, output); + return ScalarSubDispatcher::execute(scalar_a, vec_b, size, *config, output); } template <> @@ -225,11 +222,10 @@ namespace icicle { const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, scalar_sub_vec)(scalar_a, vec_b, size, use_single_scalar, &config, output); + return CONCAT_EXPAND(FIELD, scalar_sub_vec)(scalar_a, vec_b, size, &config, output); } /*********************************** MUL BY SCALAR ***********************************/ ICICLE_DISPATCHER_INST(ScalarMulDispatcher, scalar_mul_vec, scalarVectorOpImpl); @@ -238,11 +234,10 @@ namespace icicle { const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig* config, scalar_t* output) { - return ScalarMulDispatcher::execute(scalar_a, vec_b, size, use_single_scalar, *config, output); + return ScalarMulDispatcher::execute(scalar_a, vec_b, size, *config, output); } template <> @@ -250,11 +245,10 @@ namespace icicle { const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, - bool use_single_scalar, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, scalar_mul_vec)(scalar_a, vec_b, size, use_single_scalar, &config, output); + return CONCAT_EXPAND(FIELD, scalar_mul_vec)(scalar_a, vec_b, size, &config, output); } /*********************************** CONVERT MONTGOMERY ***********************************/ diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 00985a25d..4c079a80c 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ 
-358,22 +358,19 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) const uint64_t N = 1 << (rand() % 15 + 3); const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; - const bool use_single_scalar = rand() % 2; // const uint64_t N = 1 << (4); // const int batch_size = 7; // const bool columns_batch = 1; - // const bool use_single_scalar = 0; const int total_size = N * batch_size; - auto scalar_a = std::make_unique(use_single_scalar ? 1 : batch_size); + auto scalar_a = std::make_unique(batch_size); auto in_b = std::make_unique(total_size); auto out_main = std::make_unique(total_size); auto out_ref = std::make_unique(total_size); ICICLE_LOG_DEBUG << "N = " << N; ICICLE_LOG_DEBUG << "batch_size = " << batch_size; ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; - ICICLE_LOG_DEBUG << "use_single_scalar = " << use_single_scalar; auto vector_accumulate_wrapper = [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { @@ -393,13 +390,13 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) START_TIMER(VECADD_sync) for (int i = 0; i < iters; ++i) { - ICICLE_CHECK(vec_op_func(scalar_a.get(), in_b.get(), N, use_single_scalar, config, out)); + ICICLE_CHECK(vec_op_func(scalar_a.get(), in_b.get(), N, config, out)); } END_TIMER(VECADD_sync, oss.str().c_str(), measure); }; // // scalar add vec - FieldApiTest::random_samples(scalar_a.get(), (use_single_scalar ? 1 : batch_size)); + FieldApiTest::random_samples(scalar_a.get(), batch_size); FieldApiTest::random_samples(in_b.get(), total_size); // reference @@ -407,7 +404,7 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { uint64_t idx_b = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; - out_ref[idx_b] = (use_single_scalar ? 
scalar_a[0] : scalar_a[idx_in_batch]) + in_b[idx_b]; + out_ref[idx_b] = (scalar_a[idx_in_batch]) + in_b[idx_b]; } } } else { @@ -425,14 +422,14 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); // scalar sub vec - FieldApiTest::random_samples(scalar_a.get(), (use_single_scalar ? 1 : batch_size)); + FieldApiTest::random_samples(scalar_a.get(), batch_size); FieldApiTest::random_samples(in_b.get(), total_size); if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { uint64_t idx_b = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; - out_ref[idx_b] = (use_single_scalar ? scalar_a[0] : scalar_a[idx_in_batch]) - in_b[idx_b]; + out_ref[idx_b] = (scalar_a[idx_in_batch]) - in_b[idx_b]; } } } else { @@ -443,14 +440,14 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); // // scalar mul vec - FieldApiTest::random_samples(scalar_a.get(), (use_single_scalar ? 1 : batch_size)); + FieldApiTest::random_samples(scalar_a.get(), batch_size); FieldApiTest::random_samples(in_b.get(), total_size); if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { uint64_t idx_b = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; - out_ref[idx_b] = (use_single_scalar ? 
scalar_a[0] : scalar_a[idx_in_batch]) * in_b[idx_b]; + out_ref[idx_b] = (scalar_a[idx_in_batch]) * in_b[idx_b]; } } } else { @@ -788,12 +785,12 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) int seed = time(0); srand(seed); // ICICLE_LOG_DEBUG << "seed = " << seed; - const uint64_t N = 1 << (rand() % 15 + 3); - const int batch_size = 1 << (rand() % 5); - const bool columns_batch = rand() % 2; - // const uint64_t N = 1 << (3); - // const int batch_size = 1 << (1); - // const bool columns_batch = true; + // const uint64_t N = 1 << (rand() % 15 + 3); + // const int batch_size = 1 << (rand() % 5); + // const bool columns_batch = rand() % 2; + const uint64_t N = 1 << (8); + const int batch_size = 1 << (3); + const bool columns_batch = 1; const int total_size = N * batch_size; auto in_a = std::make_unique(total_size); @@ -819,7 +816,7 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) // Initialize each entire vector with 1 at a random index. The highest non-zero index is the index with 1 for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { - if (!s_is_cuda_registered) { out_ref[idx_in_batch] = rand() % N; } // highest_non_zero_idx + if (!s_is_cuda_registered) { out_ref[idx_in_batch] = static_cast(rand() % N); } // highest_non_zero_idx for (uint32_t i = 0; i < N; i++) { if (columns_batch) { in_a[idx_in_batch + batch_size * i] = TypeParam::from(i == out_ref[idx_in_batch] ? 
1 : 0); @@ -833,7 +830,11 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) // std::cout << "out_main:\t["; for (int i = 0; i < batch_size-1; i++) { std::cout << out_main[i] << ", "; } std::cout // < Date: Wed, 30 Oct 2024 13:24:36 +0200 Subject: [PATCH 19/43] fix API --- icicle/include/icicle/vec_ops.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index 132d6cb69..291462d6b 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -417,8 +417,10 @@ namespace icicle { eIcicleError polynomial_division( const T* numerator, int64_t numerator_deg, + uint64_t numerator_size, const T* denumerator, int64_t denumerator_deg, + uint64_t denumerator_size, uint64_t q_size, uint64_t r_size, const VecOpsConfig& config, From 2fd1facf999e814ce2a043120ac498e1e66b081b Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Wed, 30 Oct 2024 13:40:19 +0200 Subject: [PATCH 20/43] fix API --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 2 ++ icicle/src/vec_ops.cpp | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 7133bec8c..f27ab5600 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -910,8 +910,10 @@ eIcicleError cpu_poly_divide( const Device& device, const T* numerator, int64_t numerator_deg, + uint64_t numerator_size, const T* denumerator, int64_t denumerator_deg, + uint64_t denumerator_size, uint64_t q_size, uint64_t r_size, const VecOpsConfig& config, diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index 5eb4ea49e..c722c595a 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -425,8 +425,10 @@ namespace icicle { extern "C" eIcicleError CONCAT_EXPAND(FIELD, poly_division)( const scalar_t* numerator, int64_t numerator_deg, + uint64_t numerator_size, const scalar_t* denumerator, 
int64_t denumerator_deg, + uint64_t denumerator_size, uint64_t q_size, uint64_t r_size, const VecOpsConfig* config, @@ -434,15 +436,17 @@ namespace icicle { scalar_t* r_out /*OUT*/) { return ScalarPolyDivDispatcher::execute( - numerator, numerator_deg, denumerator, denumerator_deg, q_size, r_size, *config, q_out, r_out); + numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, *config, q_out, r_out); } template <> eIcicleError polynomial_division( const scalar_t* numerator, int64_t numerator_deg, + uint64_t numerator_size, const scalar_t* denumerator, int64_t denumerator_deg, + uint64_t denumerator_size, uint64_t q_size, uint64_t r_size, const VecOpsConfig& config, @@ -450,7 +454,7 @@ namespace icicle { scalar_t* r_out /*OUT*/) { return CONCAT_EXPAND(FIELD, poly_division)( - numerator, numerator_deg, denumerator, denumerator_deg, q_size, r_size, &config, q_out, r_out); + numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, &config, q_out, r_out); } } // namespace icicle \ No newline at end of file From 1bd7c0501867c1f49a0c55753fe46199e3b758c8 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Wed, 30 Oct 2024 23:52:31 +0200 Subject: [PATCH 21/43] non zero passes --- .../include/icicle/backend/vec_ops_backend.h | 2 ++ .../default_backend/default_poly_backend.h | 2 +- icicle/include/icicle/vec_ops.h | 10 ++++++++ icicle/src/vec_ops.cpp | 6 ++--- icicle/tests/test_field_api.cpp | 24 +++++++++---------- 5 files changed, 28 insertions(+), 16 deletions(-) diff --git a/icicle/include/icicle/backend/vec_ops_backend.h b/icicle/include/icicle/backend/vec_ops_backend.h index 1adfe89f8..04f7ed73f 100644 --- a/icicle/include/icicle/backend/vec_ops_backend.h +++ b/icicle/include/icicle/backend/vec_ops_backend.h @@ -74,8 +74,10 @@ namespace icicle { const Device& device, const scalar_t* numerator, int64_t numerator_deg, + uint64_t numerator_size, const scalar_t* denumerator, 
int64_t denumerator_deg, + uint64_t denumerator_size, uint64_t q_size, uint64_t r_size, const VecOpsConfig& config, diff --git a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h index 12468cb53..a42c87317 100644 --- a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h +++ b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h @@ -278,7 +278,7 @@ namespace icicle { config.is_result_on_device = true; ICICLE_CHECK(icicle::polynomial_division( - a_coeffs, deg_a, b_coeffs, deg_b, deg_a - deg_b + 1, a_N, config, Q_coeffs, R_coeffs)); + a_coeffs, deg_a, a_N, b_coeffs, deg_b, b_N, deg_a - deg_b + 1, a_N, config, Q_coeffs, R_coeffs)); } void quotient(PolyContext Q, PolyContext op_a, PolyContext op_b) override diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index 291462d6b..bf5eab324 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -342,6 +342,16 @@ namespace icicle { const VecOpsConfig& config, T* vec_out); + // Deprecated slice API + template + eIcicleError slice( + const T* vec_in, + uint64_t offset, + uint64_t stride, + uint64_t size, + const VecOpsConfig& config, + T* vec_out); + /** * @brief Finds the highest non-zero index in a vector or batch of vectors. 
* diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index c722c595a..4606a1f8c 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -431,12 +431,12 @@ namespace icicle { uint64_t denumerator_size, uint64_t q_size, uint64_t r_size, - const VecOpsConfig* config, + const VecOpsConfig& config, scalar_t* q_out /*OUT*/, scalar_t* r_out /*OUT*/) { return ScalarPolyDivDispatcher::execute( - numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, *config, q_out, r_out); + numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, config, q_out, r_out); } template <> @@ -454,7 +454,7 @@ namespace icicle { scalar_t* r_out /*OUT*/) { return CONCAT_EXPAND(FIELD, poly_division)( - numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, &config, q_out, r_out); + numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, config, q_out, r_out); } } // namespace icicle \ No newline at end of file diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 4c079a80c..95d673f8a 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -784,13 +784,13 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) { int seed = time(0); srand(seed); - // ICICLE_LOG_DEBUG << "seed = " << seed; - // const uint64_t N = 1 << (rand() % 15 + 3); - // const int batch_size = 1 << (rand() % 5); - // const bool columns_batch = rand() % 2; - const uint64_t N = 1 << (8); - const int batch_size = 1 << (3); - const bool columns_batch = 1; + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + // const uint64_t N = 1 << (20); + // const int batch_size = 1 << (0); + // const bool columns_batch = 0; const int total_size = N * 
batch_size; auto in_a = std::make_unique(total_size); @@ -816,7 +816,7 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) // Initialize each entire vector with 1 at a random index. The highest non-zero index is the index with 1 for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { - if (!s_is_cuda_registered) { out_ref[idx_in_batch] = static_cast(rand() % N); } // highest_non_zero_idx + out_ref[idx_in_batch] = static_cast(rand() % N); // highest_non_zero_idx for (uint32_t i = 0; i < N; i++) { if (columns_batch) { in_a[idx_in_batch + batch_size * i] = TypeParam::from(i == out_ref[idx_in_batch] ? 1 : 0); @@ -830,9 +830,9 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) // std::cout << "out_main:\t["; for (int i = 0; i < batch_size-1; i++) { std::cout << out_main[i] << ", "; } std::cout // < Date: Thu, 31 Oct 2024 13:19:31 +0200 Subject: [PATCH 22/43] slice and poly_dev apis deprecated use new ones with warning --- icicle/include/icicle/vec_ops.h | 33 ++++++++----- icicle/src/vec_ops.cpp | 85 ++++++++++++++++++++------------- 2 files changed, 73 insertions(+), 45 deletions(-) diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index bf5eab324..2868aa682 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -232,8 +232,7 @@ namespace icicle { * @note To subtract a scalar from each element of a vector - use scalar_add_vec with negative scalar. */ template - eIcicleError scalar_add_vec( - const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); + eIcicleError scalar_add_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); /** * @brief Subtracts each element of a vector from a scalar, elementwise (res[i]=scalar-vec[i]). @@ -252,8 +251,7 @@ namespace icicle { * @note To subtract a scalar from each element of a vector - use scalar_add_vec with negative scalar. 
*/ template - eIcicleError scalar_sub_vec( - const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); + eIcicleError scalar_sub_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); /** * @brief Multiplies each element of a vector by a scalar. @@ -271,8 +269,7 @@ namespace icicle { * @return eIcicleError Error code indicating success or failure. */ template - eIcicleError scalar_mul_vec( - const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); + eIcicleError scalar_mul_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); // Matrix operations @@ -344,13 +341,8 @@ namespace icicle { // Deprecated slice API template - eIcicleError slice( - const T* vec_in, - uint64_t offset, - uint64_t stride, - uint64_t size, - const VecOpsConfig& config, - T* vec_out); + eIcicleError + slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size_out, const VecOpsConfig& config, T* vec_out); /** * @brief Finds the highest non-zero index in a vector or batch of vectors. @@ -406,9 +398,11 @@ namespace icicle { * contiguously. * - If `config.columns_batch` is `true`, coefficients are interleaved. * @param numerator_deg Degree of the numerator polynomial. + * @param numerator_size size (number of T elements) in numerator vec * @param denominator Pointer to the array of coefficients of the denominator polynomial(s). * - Storage layout is similar to `numerator`. * @param denominator_deg Degree of the denominator polynomial. + * @param denominator_size size (number of T elements) in denumerator vec * @param config Configuration for the operation. * @param q_size Size of the quotient array for one polynomial. * @param r_size Size of the remainder array. 
@@ -437,4 +431,17 @@ namespace icicle { T* q_out /*OUT*/, T* r_out /*OUT*/); + // deprecated API + template + eIcicleError polynomial_division( + const T* numerator, + int64_t numerator_deg, + const T* denumerator, + int64_t denumerator_deg, + const VecOpsConfig& config, + T* q_out /*OUT*/, + uint64_t q_size, + T* r_out /*OUT*/, + uint64_t r_size); + } // namespace icicle \ No newline at end of file diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index 4606a1f8c..c8b867470 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -184,22 +184,14 @@ namespace icicle { ICICLE_DISPATCHER_INST(ScalarAddDispatcher, scalar_add_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_add_vec)( - const scalar_t* scalar_a, - const scalar_t* vec_b, - uint64_t size, - const VecOpsConfig* config, - scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { return ScalarAddDispatcher::execute(scalar_a, vec_b, size, *config, output); } template <> eIcicleError scalar_add_vec( - const scalar_t* scalar_a, - const scalar_t* vec_b, - uint64_t size, - const VecOpsConfig& config, - scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { return CONCAT_EXPAND(FIELD, scalar_add_vec)(scalar_a, vec_b, size, &config, output); } @@ -208,22 +200,14 @@ namespace icicle { ICICLE_DISPATCHER_INST(ScalarSubDispatcher, scalar_sub_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_sub_vec)( - const scalar_t* scalar_a, - const scalar_t* vec_b, - uint64_t size, - const VecOpsConfig* config, - scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { return ScalarSubDispatcher::execute(scalar_a, vec_b, size, *config, output); } template <> eIcicleError scalar_sub_vec( - const scalar_t* scalar_a, - const scalar_t* 
vec_b, - uint64_t size, - const VecOpsConfig& config, - scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { return CONCAT_EXPAND(FIELD, scalar_sub_vec)(scalar_a, vec_b, size, &config, output); } @@ -231,22 +215,14 @@ namespace icicle { ICICLE_DISPATCHER_INST(ScalarMulDispatcher, scalar_mul_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_mul_vec)( - const scalar_t* scalar_a, - const scalar_t* vec_b, - uint64_t size, - const VecOpsConfig* config, - scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { return ScalarMulDispatcher::execute(scalar_a, vec_b, size, *config, output); } template <> eIcicleError scalar_mul_vec( - const scalar_t* scalar_a, - const scalar_t* vec_b, - uint64_t size, - const VecOpsConfig& config, - scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { return CONCAT_EXPAND(FIELD, scalar_mul_vec)(scalar_a, vec_b, size, &config, output); } @@ -347,6 +323,25 @@ namespace icicle { return CONCAT_EXPAND(FIELD, slice)(input, offset, stride, size_in, size_out, &config, output); } + // Deprecated API + template <> + eIcicleError slice( + const scalar_t* input, + uint64_t offset, + uint64_t stride, + uint64_t size_out, + const VecOpsConfig& config, + scalar_t* output) + { + const auto size_in = offset + stride * (size_out - 1) + 1; // input should be at least that large + ICICLE_LOG_WARNING << "slice api is deprecated and replace with new api. 
Use new slice api instead"; + if (config.batch_size != 1) { + ICICLE_LOG_ERROR << "deprecated slice API does not support batch"; + return eIcicleError::INVALID_ARGUMENT; + } + return slice(input, offset, stride, size_in, size_out, config, output); + } + #ifdef EXT_FIELD ICICLE_DISPATCHER_INST(ExtFieldSliceDispatcher, extension_slice, extFieldSliceOpImpl) @@ -436,7 +431,8 @@ namespace icicle { scalar_t* r_out /*OUT*/) { return ScalarPolyDivDispatcher::execute( - numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, config, q_out, r_out); + numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, config, + q_out, r_out); } template <> @@ -454,7 +450,32 @@ namespace icicle { scalar_t* r_out /*OUT*/) { return CONCAT_EXPAND(FIELD, poly_division)( - numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, config, q_out, r_out); + numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, config, + q_out, r_out); + } + + // Deprecated API + template <> + eIcicleError polynomial_division( + const scalar_t* numerator, + int64_t numerator_deg, + const scalar_t* denumerator, + int64_t denumerator_deg, + const VecOpsConfig& config, + scalar_t* q_out /*OUT*/, + uint64_t q_size, + scalar_t* r_out /*OUT*/, + uint64_t r_size) + { + ICICLE_LOG_WARNING + << "polynomial_division api is deprecated and replace with new api. 
Use new polynomial_division api instead"; + if (config.batch_size != 1) { + ICICLE_LOG_ERROR << "deprecated polynomial_division API does not support batch"; + return eIcicleError::INVALID_ARGUMENT; + } + return polynomial_division( + numerator, numerator_deg, numerator_deg + 1, denumerator, denumerator_deg, denumerator_deg + 1, q_size, r_size, + config, q_out, r_out); } } // namespace icicle \ No newline at end of file From 916618ce079d6b6fb6860ee705737c04798fd0d6 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Thu, 31 Oct 2024 15:58:56 +0200 Subject: [PATCH 23/43] poly eval WIP --- icicle/tests/test_field_api.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 95d673f8a..a4d20ce54 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -841,11 +841,17 @@ TYPED_TEST(FieldApiTest, polynomialEval) { int seed = time(0); srand(seed); - // ICICLE_LOG_DEBUG << "seed = " << seed; + ICICLE_LOG_DEBUG << "seed = " << seed; + // const uint64_t coeffs_size = 1 << (rand() % 10 + 4); + // const uint64_t domain_size = 1 << (rand() % 8 + 2); + // const int batch_size = 1 << (rand() % 5); + // const bool columns_batch = rand() % 2; + const uint64_t coeffs_size = 1 << (rand() % 10 + 4); const uint64_t domain_size = 1 << (rand() % 8 + 2); - const int batch_size = 1 << (rand() % 5); - const bool columns_batch = rand() % 2; + const int batch_size = 1 << (0); + const bool columns_batch = 0; + const int total_coeffs_size = coeffs_size * batch_size; auto in_coeffs = std::make_unique(total_coeffs_size); From f033bdbda2ca1aa6e53c23b6c2e0cd615271a6e4 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Thu, 31 Oct 2024 17:47:09 +0200 Subject: [PATCH 24/43] poly eval passes --- icicle/tests/test_field_api.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 
a4d20ce54..82ab117b7 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -842,15 +842,15 @@ TYPED_TEST(FieldApiTest, polynomialEval) int seed = time(0); srand(seed); ICICLE_LOG_DEBUG << "seed = " << seed; - // const uint64_t coeffs_size = 1 << (rand() % 10 + 4); - // const uint64_t domain_size = 1 << (rand() % 8 + 2); - // const int batch_size = 1 << (rand() % 5); - // const bool columns_batch = rand() % 2; - const uint64_t coeffs_size = 1 << (rand() % 10 + 4); const uint64_t domain_size = 1 << (rand() % 8 + 2); - const int batch_size = 1 << (0); - const bool columns_batch = 0; + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + + // const uint64_t coeffs_size = 1 << (3); + // const uint64_t domain_size = 3; + // const int batch_size = 1 << (1); + // const bool columns_batch = 1; const int total_coeffs_size = coeffs_size * batch_size; From 35d2e2384faa731b9651c7beda8165b50bf24150 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Thu, 31 Oct 2024 20:57:09 +0200 Subject: [PATCH 25/43] fix types + --- icicle/tests/test_field_api.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 82ab117b7..49fe79709 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -909,7 +909,8 @@ TYPED_TEST(FieldApiTest, polynomialDivision) const int64_t denumerator_deg = 2; const uint64_t q_size = 2; const uint64_t r_size = 4; - const int batch_size = 1 << (rand() % 5); + // const int batch_size = 1 << (rand() % 5); + const int batch_size = 1; const bool columns_batch = rand() % 2; const int64_t total_numerator_size = (numerator_deg + 1) * batch_size; @@ -995,13 +996,13 @@ TYPED_TEST(FieldApiTest, polynomialDivision) if (s_is_cuda_registered) { run(s_reference_target, q_out_ref.get(), r_out_ref.get(), VERBOSE /*=measure*/, "polynomial_division", 1); } - // std::cout << "numerator:\t["; 
for (int i = 0; i < total_numerator_size-1; i++) { std::cout << numerator[i] << ", "; - // } std::cout < Date: Thu, 31 Oct 2024 21:17:53 +0200 Subject: [PATCH 26/43] tidy up --- icicle/tests/test_field_api.cpp | 410 +++++++++++++------------------- 1 file changed, 168 insertions(+), 242 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 49fe79709..e43c60fff 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -17,6 +17,8 @@ using namespace field_config; using namespace icicle; +//TODO - add tests that test different configurations of data on device or on host. + using FpMicroseconds = std::chrono::duration; #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); #define END_TIMER(timer, msg, enable) \ @@ -28,7 +30,6 @@ static bool VERBOSE = true; static int ITERS = 1; static inline std::string s_main_target; static inline std::string s_reference_target; -// static const bool s_is_cuda_registered = is_device_registered("CUDA"); bool s_is_cuda_registered; template @@ -95,18 +96,18 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) srand(seed); ICICLE_LOG_DEBUG << "seed = " << seed; const uint64_t N = 1 << (rand() % 15 + 3); - // const uint64_t N = 1 << (3); const int batch_size = 1 << (rand() % 5); - // const int batch_size = 2; const bool columns_batch = rand() % 2; + + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + const int total_size = N * batch_size; auto in_a = std::make_unique(total_size); auto in_b = std::make_unique(total_size); auto out_main = std::make_unique(total_size); auto out_ref = std::make_unique(total_size); - ICICLE_LOG_DEBUG << "N = " << N; - ICICLE_LOG_DEBUG << "batch_size = " << batch_size; - ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; auto vector_accumulate_wrapper = [](TypeParam* a, const TypeParam* b, uint64_t size, const 
VecOpsConfig& config, TypeParam* /*out*/) { @@ -131,19 +132,7 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) END_TIMER(VECADD_sync, oss.str().c_str(), measure); }; - // warmup - // run(s_reference_target, out_ref.get(), false /*=measure*/, 16 /*=iters*/); - // run(s_main_target, out_main.get(), false /*=measure*/, 1 /*=iters*/); - - // warmup - // run(s_reference_target, out_ref.get(), false /*=measure*/, 16 /*=iters*/); - // run(s_main_target, out_main.get(), false /*=measure*/, 1 /*=iters*/); - - // Element-wise vector operations - // If config.batch_size>1, (columns_batch=true or false) the operation is done element-wise anyway, so it doesn't - // affect the test - - // // add + // add FieldApiTest::random_samples(in_a.get(), total_size); FieldApiTest::random_samples(in_b.get(), total_size); if (!s_is_cuda_registered) { @@ -156,25 +145,17 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); - // // accumulate + // accumulate FieldApiTest::random_samples(in_a.get(), total_size); FieldApiTest::random_samples(in_b.get(), total_size); - // if (!s_is_cuda_registered) { - for (int i = 0; i < total_size; i++) { - out_ref[i] = in_a[i] + in_b[i]; - } - // } else { - // run(s_reference_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); - // } + for (int i = 0; i < total_size; i++) { //TODO - compare gpu against cpu with inplace operations? 
+ out_ref[i] = in_a[i] + in_b[i]; + } run(s_main_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); - // for (int i = 0; i < total_size; i++) { - // ICICLE_LOG_DEBUG << i << ", " << in_a[i] << ", " << in_b[i] << ", " << out_ref[i]; - // } - ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), total_size * sizeof(TypeParam))); - // // sub + // sub FieldApiTest::random_samples(in_a.get(), total_size); FieldApiTest::random_samples(in_b.get(), total_size); if (!s_is_cuda_registered) { @@ -187,7 +168,7 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sub, "vector sub", ITERS); ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); - // // mul + // mul FieldApiTest::random_samples(in_a.get(), total_size); FieldApiTest::random_samples(in_b.get(), total_size); if (!s_is_cuda_registered) { @@ -219,11 +200,15 @@ TYPED_TEST(FieldApiTest, montgomeryConversion) { int seed = time(0); srand(seed); - // ICICLE_LOG_DEBUG << "seed = " << seed; + ICICLE_LOG_DEBUG << "seed = " << seed; const uint64_t N = 1 << (rand() % 15 + 3); const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; const bool is_to_montgomery = rand() % 2; + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + ICICLE_LOG_DEBUG << "is_to_montgomery = " << is_to_montgomery; const int total_size = N * batch_size; auto in_a = std::make_unique(total_size); auto out_main = std::make_unique(total_size); @@ -246,10 +231,6 @@ TYPED_TEST(FieldApiTest, montgomeryConversion) END_TIMER(MONTGOMERY, oss.str().c_str(), measure); }; - // Element-wise operation - // If config.batch_size>1, (columns_batch=true or false) the addition is done element-wise anyway, so it doesn't - // affect the test - // convert_montgomery FieldApiTest::random_samples(in_a.get(), total_size); // reference @@ 
-280,10 +261,9 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) const bool columns_batch = rand() % 2; const int total_size = N * batch_size; - // const uint64_t N = 1 << (20); - // const int batch_size = 1 << 4; - // const bool columns_batch = 1; - // const int total_size = N * batch_size; + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; auto in_a = std::make_unique(total_size); auto out_main = std::make_unique(batch_size); @@ -312,7 +292,7 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) END_TIMER(VECADD_sync, oss.str().c_str(), measure); }; - // // sum + // sum FieldApiTest::random_samples(in_a.get(), total_size); // reference for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { @@ -331,7 +311,7 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(TypeParam))); - // // product + // product FieldApiTest::random_samples(in_a.get(), total_size); if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { @@ -359,9 +339,9 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; - // const uint64_t N = 1 << (4); - // const int batch_size = 7; - // const bool columns_batch = 1; + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; const int total_size = N * batch_size; auto scalar_a = std::make_unique(batch_size); @@ -395,7 +375,7 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) END_TIMER(VECADD_sync, oss.str().c_str(), measure); }; - // // scalar add vec + // scalar add vec FieldApiTest::random_samples(scalar_a.get(), batch_size); FieldApiTest::random_samples(in_b.get(), total_size); @@ -411,13 +391,6 @@ 
TYPED_TEST(FieldApiTest, scalarVectorOps) run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); - - - // ICICLE_LOG_DEBUG << scalar_a[0] << ", "; - // ICICLE_LOG_DEBUG << scalar_a[1] << ", "; - // for (int i = 0; i < total_size; i++) { - // ICICLE_LOG_DEBUG << i << ", " << in_b[i] << ", " << out_main[i] << ", " << out_ref[i]; - // } ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); @@ -439,7 +412,7 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); - // // scalar mul vec + // scalar mul vec FieldApiTest::random_samples(scalar_a.get(), batch_size); FieldApiTest::random_samples(in_b.get(), total_size); @@ -461,23 +434,20 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) { int seed = time(0); srand(seed); - // ICICLE_LOG_DEBUG << "seed = " << seed; + ICICLE_LOG_DEBUG << "seed = " << seed; const int R = - 1 << (rand() % 8 + 2); // cpu implementation for out of place trancpose also supports sizes wich are not powers of 2 + 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes wich are not powers of 2 const int C = - 1 << (rand() % 8 + 2); // cpu implementation for out of place trancpose also supports sizes wich are not powers of 2 + 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes wich are not powers of 2 const int batch_size = 1 << (rand() % 4); const bool columns_batch = rand() % 2; const bool is_in_place = s_is_cuda_registered? 
0 : rand() % 2; //TODO - fix inplace (Hadar: I'm not sure we should support it) - // const int R = 4; // cpu implementation for out of place trancpose also supports sizes wich are not powers of 2 - // const int C = 3; - // const int batch_size = 1 << (1); - // const bool columns_batch = 1; - // const bool is_in_place = 1; + ICICLE_LOG_DEBUG << "rows = " << R; + ICICLE_LOG_DEBUG << "cols = " << C; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; - // ICICLE_LOG_DEBUG << "R = " << R << ", C = " << C << ", batch_size = " << batch_size << ", columns_batch = " << - // columns_batch << ", is_in_place = " << is_in_place; //TODO SHANIE - remove this const int total_size = R * C * batch_size; auto h_inout = std::make_unique(total_size); auto h_out_main = std::make_unique(total_size); @@ -527,7 +497,7 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) } }; - // // Option 1: Initialize each input matrix in the batch with the same ascending values + // Option 1: Initialize each input matrix in the batch with the same ascending values // for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { // for (uint32_t i = 0; i < R * C; i++) { // if(columns_batch){ @@ -538,7 +508,7 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) // } // } - // // Option 2: Initialize the entire input array with ascending values + // Option 2: Initialize the entire input array with ascending values // for (int i = 0; i < total_size; i++) { // h_inout[i] = TypeParam::from(i); // } @@ -568,19 +538,10 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) run(s_main_target, (is_in_place ? 
h_inout.get() : h_out_main.get()), VERBOSE /*=measure*/, "transpose", ITERS); - // ICICLE_LOG_DEBUG << scalar_a[0] << ", "; - // for (int i = 0; i < total_size; i++) { - // ICICLE_LOG_DEBUG << i << ", " << h_inout[i] << ", " << h_out_main[i] << ", " << h_out_ref[i]; - // } - if (is_in_place) { ASSERT_EQ(0, memcmp(h_inout.get(), h_out_ref.get(), total_size * sizeof(TypeParam))); } else { - // std::cout << "h_out_main:\t["; for (int i = 0; i < total_size-1; i++) { std::cout << h_out_main[i] << ", "; } - // std::cout <::random_samples(numerator.get(), total_numerator_size); - // FieldApiTest::random_samples(denumerator.get(), total_denumerator_size); - // // Reference implementation - // TODO - Check in comperison with GPU implementation or implement a general reference implementation - - // Option 2: Initialize the numerator and denumerator with chosen example - // And the reference implementation for the example - - for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { - if (columns_batch) { - // numerator = 3x^3+4x^2+5 - numerator[idx_in_batch + 0 * batch_size] = TypeParam::from(5); - numerator[idx_in_batch + 1 * batch_size] = TypeParam::from(0); - numerator[idx_in_batch + 2 * batch_size] = TypeParam::from(4); - numerator[idx_in_batch + 3 * batch_size] = TypeParam::from(3); - // denumerator = x^2-1 - denumerator[idx_in_batch + 0 * batch_size] = TypeParam::from(0) - TypeParam::from(1); - denumerator[idx_in_batch + 1 * batch_size] = TypeParam::from(0); - denumerator[idx_in_batch + 2 * batch_size] = TypeParam::from(1); - if (!s_is_cuda_registered) { - // q_out_ref = 3x+4 - q_out_ref[idx_in_batch + 0 * batch_size] = TypeParam::from(4); - q_out_ref[idx_in_batch + 1 * batch_size] = TypeParam::from(3); - // r_out_ref = 3x+9 - r_out_ref[idx_in_batch + 0 * batch_size] = TypeParam::from(9); - r_out_ref[idx_in_batch + 1 * batch_size] = TypeParam::from(3); - } - } else { - // numerator = 3x^3+4x^2+5 - numerator[idx_in_batch * (numerator_deg + 1) + 0] = 
TypeParam::from(5); - numerator[idx_in_batch * (numerator_deg + 1) + 1] = TypeParam::from(0); - numerator[idx_in_batch * (numerator_deg + 1) + 2] = TypeParam::from(4); - numerator[idx_in_batch * (numerator_deg + 1) + 3] = TypeParam::from(3); - // denumerator = x^2-1 - denumerator[idx_in_batch * (denumerator_deg + 1) + 0] = TypeParam::from(0) - TypeParam::from(1); - denumerator[idx_in_batch * (denumerator_deg + 1) + 1] = TypeParam::from(0); - denumerator[idx_in_batch * (denumerator_deg + 1) + 2] = TypeParam::from(1); - if (!s_is_cuda_registered) { - // q_out_ref = 3x+4 - q_out_ref[idx_in_batch * q_size + 0] = TypeParam::from(4); - q_out_ref[idx_in_batch * q_size + 1] = TypeParam::from(3); - // r_out_ref = 3x+9 - r_out_ref[idx_in_batch * r_size + 0] = TypeParam::from(9); - r_out_ref[idx_in_batch * r_size + 1] = TypeParam::from(3); - } - } - } +// TYPED_TEST(FieldApiTest, polynomialDivision) +// { +// int seed = time(0); +// srand(seed); +// ICICLE_LOG_DEBUG << "seed = " << seed; +// // const int64_t numerator_deg = 1 << 4; +// // const int64_t denumerator_deg = 1 << 2; +// // const uint64_t q_size = numerator_deg - denumerator_deg + 1; +// // const uint64_t r_size = numerator_deg + 1; +// const int64_t numerator_deg = 3; +// const int64_t denumerator_deg = 2; +// const uint64_t q_size = 2; +// const uint64_t r_size = 4; +// // const int batch_size = 1 << (rand() % 5); +// const int batch_size = 1; +// const bool columns_batch = rand() % 2; + +// const int64_t total_numerator_size = (numerator_deg + 1) * batch_size; +// const int64_t total_denumerator_size = (denumerator_deg + 1) * batch_size; +// const uint64_t total_q_size = q_size * batch_size; +// const uint64_t total_r_size = r_size * batch_size; + +// auto numerator = std::make_unique(total_numerator_size); +// auto denumerator = std::make_unique(total_denumerator_size); +// auto q_out_main = std::make_unique(total_q_size); +// auto r_out_main = std::make_unique(total_r_size); +// auto q_out_ref = 
std::make_unique(total_q_size); +// auto r_out_ref = std::make_unique(total_r_size); + +// auto run = +// [&](const std::string& dev_type, TypeParam* q_out, TypeParam* r_out, bool measure, const char* msg, int iters) { +// Device dev = {dev_type, 0}; +// icicle_set_device(dev); +// auto config = default_vec_ops_config(); +// config.batch_size = batch_size; +// config.columns_batch = columns_batch; + +// std::ostringstream oss; +// oss << dev_type << " " << msg; + +// START_TIMER(polynomialDivision) +// for (int i = 0; i < iters; ++i) { +// ICICLE_CHECK(polynomial_division( +// numerator.get(), numerator_deg, total_numerator_size, denumerator.get(), denumerator_deg, total_denumerator_size, q_size, r_size, config, q_out, r_out)); +// } +// END_TIMER(polynomialDivision, oss.str().c_str(), measure); +// }; + +// // // Option 1: Initialize input vectors with random values +// // FieldApiTest::random_samples(numerator.get(), total_numerator_size); +// // FieldApiTest::random_samples(denumerator.get(), total_denumerator_size); +// // // Reference implementation +// // TODO - Check in comperison with GPU implementation or implement a general reference implementation + +// // Option 2: Initialize the numerator and denumerator with chosen example +// // And the reference implementation for the example + +// for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { +// if (columns_batch) { +// // numerator = 3x^3+4x^2+5 +// numerator[idx_in_batch + 0 * batch_size] = TypeParam::from(5); +// numerator[idx_in_batch + 1 * batch_size] = TypeParam::from(0); +// numerator[idx_in_batch + 2 * batch_size] = TypeParam::from(4); +// numerator[idx_in_batch + 3 * batch_size] = TypeParam::from(3); +// // denumerator = x^2-1 +// denumerator[idx_in_batch + 0 * batch_size] = TypeParam::from(0) - TypeParam::from(1); +// denumerator[idx_in_batch + 1 * batch_size] = TypeParam::from(0); +// denumerator[idx_in_batch + 2 * batch_size] = TypeParam::from(1); +// if 
(!s_is_cuda_registered) { +// // q_out_ref = 3x+4 +// q_out_ref[idx_in_batch + 0 * batch_size] = TypeParam::from(4); +// q_out_ref[idx_in_batch + 1 * batch_size] = TypeParam::from(3); +// // r_out_ref = 3x+9 +// r_out_ref[idx_in_batch + 0 * batch_size] = TypeParam::from(9); +// r_out_ref[idx_in_batch + 1 * batch_size] = TypeParam::from(3); +// } +// } else { +// // numerator = 3x^3+4x^2+5 +// numerator[idx_in_batch * (numerator_deg + 1) + 0] = TypeParam::from(5); +// numerator[idx_in_batch * (numerator_deg + 1) + 1] = TypeParam::from(0); +// numerator[idx_in_batch * (numerator_deg + 1) + 2] = TypeParam::from(4); +// numerator[idx_in_batch * (numerator_deg + 1) + 3] = TypeParam::from(3); +// // denumerator = x^2-1 +// denumerator[idx_in_batch * (denumerator_deg + 1) + 0] = TypeParam::from(0) - TypeParam::from(1); +// denumerator[idx_in_batch * (denumerator_deg + 1) + 1] = TypeParam::from(0); +// denumerator[idx_in_batch * (denumerator_deg + 1) + 2] = TypeParam::from(1); +// if (!s_is_cuda_registered) { +// // q_out_ref = 3x+4 +// q_out_ref[idx_in_batch * q_size + 0] = TypeParam::from(4); +// q_out_ref[idx_in_batch * q_size + 1] = TypeParam::from(3); +// // r_out_ref = 3x+9 +// r_out_ref[idx_in_batch * r_size + 0] = TypeParam::from(9); +// r_out_ref[idx_in_batch * r_size + 1] = TypeParam::from(3); +// } +// } +// } - if (s_is_cuda_registered) { - run(s_reference_target, q_out_ref.get(), r_out_ref.get(), VERBOSE /*=measure*/, "polynomial_division", 1); - } - std::cout << "numerator:\t["; for (int i = 0; i < total_numerator_size-1; i++) { std::cout << numerator[i] << ", "; - } std::cout < Date: Thu, 31 Oct 2024 21:30:40 +0200 Subject: [PATCH 27/43] formatting and spelling --- examples/c++/vector-api/example.cpp | 2 +- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 45 +++++--------------- icicle/tests/test_field_api.cpp | 29 ++++++------- icicle_v3/include/icicle/mmcs.h | 30 +++++++++++++ 4 files changed, 56 insertions(+), 50 deletions(-) create mode 100644 
icicle_v3/include/icicle/mmcs.h diff --git a/examples/c++/vector-api/example.cpp b/examples/c++/vector-api/example.cpp index 2a998c5c7..5c4497a64 100644 --- a/examples/c++/vector-api/example.cpp +++ b/examples/c++/vector-api/example.cpp @@ -7,7 +7,7 @@ #include "icicle/utils/log.h" -// SP: I undertstand this code is auto-generated, but I can't get scrip/gen to work. +// SP: I understand this code is auto-generated, but I can't get script/gen to work. extern "C" eIcicleError bn254_vector_product( const bn254::scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result, uint64_t offset, uint64_t stride); diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index f27ab5600..826fb3bd2 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -317,7 +317,7 @@ class VectorOpTask : public TaskBase } } - // Single worker functionality for out of palce matrix transpose + // Single worker functionality for out of place matrix transpose void out_of_place_transpose() { for (uint32_t k = 0; k < m_nof_operations; ++k) { @@ -367,8 +367,8 @@ class VectorOpTask : public TaskBase public: T m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer - uint64_t m_idx_in_batch; // index in the batch. Used in intermidiate res tasks -}; // class VectorOpTask + uint64_t m_idx_in_batch; // index in the batch. 
Used in intermediate res tasks +}; // class VectorOpTask #define NOF_OPERATIONS_PER_TASK 512 #define CONFIG_NOF_THREADS_KEY "n_threads" @@ -401,12 +401,7 @@ cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t size, // Execute a full task from the type vector = scalar (op) vector template eIcicleError cpu_scalar_vector_op( - VecOperation op, - const T* scalar_a, - const T* vec_b, - uint64_t size, - const VecOpsConfig& config, - T* output) + VecOperation op, const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { TasksManager> task_manager(get_nof_workers(config) - 1); const uint64_t total_nof_operations = size; @@ -416,11 +411,8 @@ eIcicleError cpu_scalar_vector_op( VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); task_p->send_2ops_task( op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), scalar_a + idx_in_batch, - config.columns_batch ? vec_b + idx_in_batch + i * config.batch_size - : vec_b + idx_in_batch * size + i, - stride, - config.columns_batch ? output + idx_in_batch + i * config.batch_size - : output + idx_in_batch * size + i); + config.columns_batch ? vec_b + idx_in_batch + i * config.batch_size : vec_b + idx_in_batch * size + i, stride, + config.columns_batch ? 
output + idx_in_batch + i * config.batch_size : output + idx_in_batch * size + i); } } task_manager.wait_done(); @@ -590,12 +582,7 @@ REGISTER_VECTOR_PRODUCT_BACKEND("CPU", cpu_vector_product); /*********************************** Scalar + Vector***********************************/ template eIcicleError cpu_scalar_add( - const Device& device, - const T* scalar_a, - const T* vec_b, - uint64_t size, - const VecOpsConfig& config, - T* output) + const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { return cpu_scalar_vector_op(VecOperation::SCALAR_ADD_VEC, scalar_a, vec_b, size, config, output); } @@ -605,12 +592,7 @@ REGISTER_SCALAR_ADD_VEC_BACKEND("CPU", cpu_scalar_add); /*********************************** Scalar - Vector***********************************/ template eIcicleError cpu_scalar_sub( - const Device& device, - const T* scalar_a, - const T* vec_b, - uint64_t size, - const VecOpsConfig& config, - T* output) + const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { return cpu_scalar_vector_op(VecOperation::SCALAR_SUB_VEC, scalar_a, vec_b, size, config, output); } @@ -620,12 +602,7 @@ REGISTER_SCALAR_SUB_VEC_BACKEND("CPU", cpu_scalar_sub); /*********************************** MUL BY SCALAR***********************************/ template eIcicleError cpu_scalar_mul( - const Device& device, - const T* scalar_a, - const T* vec_b, - uint64_t size, - const VecOpsConfig& config, - T* output) + const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { return cpu_scalar_vector_op(VecOperation::SCALAR_MUL_VEC, scalar_a, vec_b, size, config, output); } @@ -669,7 +646,7 @@ uint32_t gcd(uint32_t a, uint32_t b) return a; } -// Recursive function to generate all k-ary necklaces and to replace the elements withing the necklaces +// Recursive function to generate all k-ary necklaces and to replace the elements 
within the necklaces template void gen_necklace( uint32_t t, @@ -714,7 +691,7 @@ eIcicleError matrix_transpose_necklaces( uint32_t gcd_value = gcd(log_nof_rows, log_nof_cols); uint32_t k = 1 << gcd_value; // Base of necklaces uint32_t length = - (log_nof_cols + log_nof_rows) / gcd_value; // length of necklaces. Since all are powers of 2, equvalent to + (log_nof_cols + log_nof_rows) / gcd_value; // length of necklaces. Since all are powers of 2, equivalent to // (log_nof_cols + log_nof_rows) / gcd_value; const uint64_t max_nof_operations = NOF_OPERATIONS_PER_TASK / length; const uint64_t total_elements_one_mat = static_cast(nof_rows) * nof_cols; diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index e43c60fff..1c44464cb 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -17,7 +17,7 @@ using namespace field_config; using namespace icicle; -//TODO - add tests that test different configurations of data on device or on host. +// TODO - add tests that test different configurations of data on device or on host. using FpMicroseconds = std::chrono::duration; #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); @@ -98,11 +98,11 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) const uint64_t N = 1 << (rand() % 15 + 3); const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; - + ICICLE_LOG_DEBUG << "N = " << N; ICICLE_LOG_DEBUG << "batch_size = " << batch_size; ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; - + const int total_size = N * batch_size; auto in_a = std::make_unique(total_size); auto in_b = std::make_unique(total_size); @@ -148,7 +148,7 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) // accumulate FieldApiTest::random_samples(in_a.get(), total_size); FieldApiTest::random_samples(in_b.get(), total_size); - for (int i = 0; i < total_size; i++) { //TODO - compare gpu against cpu with inplace operations? 
+ for (int i = 0; i < total_size; i++) { // TODO - compare gpu against cpu with inplace operations? out_ref[i] = in_a[i] + in_b[i]; } run(s_main_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); @@ -391,7 +391,7 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); - + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); // scalar sub vec @@ -436,12 +436,13 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) srand(seed); ICICLE_LOG_DEBUG << "seed = " << seed; const int R = - 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes wich are not powers of 2 + 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 const int C = - 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes wich are not powers of 2 + 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 const int batch_size = 1 << (rand() % 4); const bool columns_batch = rand() % 2; - const bool is_in_place = s_is_cuda_registered? 0 : rand() % 2; //TODO - fix inplace (Hadar: I'm not sure we should support it) + const bool is_in_place = + s_is_cuda_registered ? 
0 : rand() % 2; // TODO - fix inplace (Hadar: I'm not sure we should support it) ICICLE_LOG_DEBUG << "rows = " << R; ICICLE_LOG_DEBUG << "cols = " << C; @@ -777,12 +778,12 @@ TYPED_TEST(FieldApiTest, polynomialEval) const uint64_t domain_size = 1 << (rand() % 8 + 2); const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; - + ICICLE_LOG_DEBUG << "coeffs_size = " << coeffs_size; ICICLE_LOG_DEBUG << "domain_size = " << domain_size; ICICLE_LOG_DEBUG << "batch_size = " << batch_size; ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; - + const int total_coeffs_size = coeffs_size * batch_size; auto in_coeffs = std::make_unique(total_coeffs_size); @@ -815,10 +816,7 @@ TYPED_TEST(FieldApiTest, polynomialEval) run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); if (s_is_cuda_registered) { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); - ASSERT_EQ( - 0, memcmp( - out_main.get(), out_ref.get(), - total_coeffs_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_coeffs_size * sizeof(TypeParam))); } } @@ -865,7 +863,8 @@ TYPED_TEST(FieldApiTest, polynomialEval) // START_TIMER(polynomialDivision) // for (int i = 0; i < iters; ++i) { // ICICLE_CHECK(polynomial_division( -// numerator.get(), numerator_deg, total_numerator_size, denumerator.get(), denumerator_deg, total_denumerator_size, q_size, r_size, config, q_out, r_out)); +// numerator.get(), numerator_deg, total_numerator_size, denumerator.get(), denumerator_deg, +// total_denumerator_size, q_size, r_size, config, q_out, r_out)); // } // END_TIMER(polynomialDivision, oss.str().c_str(), measure); // }; diff --git a/icicle_v3/include/icicle/mmcs.h b/icicle_v3/include/icicle/mmcs.h new file mode 100644 index 000000000..94394b822 --- /dev/null +++ b/icicle_v3/include/icicle/mmcs.h @@ -0,0 +1,30 @@ +#pragma once + +#include "errors.h" +#include "runtime.h" +#include "hash.h" +#include "merkle_tree.h" 
+#include "icicle/utils/utils.h" + +#include +#include + + +template + struct Matrix { + T* values; + size_t width; + size_t height; + }; + +eIcicleError build_mmcs_tree(const Matrix* inputs, + const unsigned int number_of_inputs, + limb_t** outputs, + const Hash& hash, + const Hash& compression, + const MerkleTreeConfig& config); + + //create hash <-hasher,compressor + + //sort, and call merkle tree + //how to return outputs? \ No newline at end of file From 32bd7808d18e280c2df2017dc70a87cd85f0f6f1 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Thu, 31 Oct 2024 21:51:01 +0200 Subject: [PATCH 28/43] ntt test --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 2 +- icicle/tests/test_field_api.cpp | 226 ++++++++----------- 2 files changed, 97 insertions(+), 131 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 826fb3bd2..24e53fa59 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -350,7 +350,7 @@ class VectorOpTask : public TaskBase VecOperation m_operation; // the operation to execute uint32_t m_nof_operations; // number of operations to execute for this task - const T* m_op_a; // pointer to operand A. Operand A is a vector, or metrix in case of replace_elements + const T* m_op_a; // pointer to operand A. Operand A is a vector, or matrix in case of replace_elements const T* m_op_b; // pointer to operand B. 
Operand B is a vector or scalar uint64_t m_start_index; // index used in bitreverse operation and out of place matrix transpose uint64_t m_stop_index; // index used in reduce operations and out of place matrix transpose diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 1c44464cb..e45a3ae0b 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -933,138 +933,104 @@ TYPED_TEST(FieldApiTest, polynomialEval) // ASSERT_EQ(0, memcmp(r_out_main.get(), r_out_ref.get(), total_r_size * sizeof(TypeParam))); // } -// #ifdef NTT -// TYPED_TEST(FieldApiTest, ntt) -// { -// // ICICLE_LOG_INFO << "Current branch: " << get_current_branch(); -// ICICLE_LOG_DEBUG << "ICICLE_LOG_DEBUG"; -// // for (int i = 3; i < 23; ++i) { -// // //Randomize configuration - -// // int seed = time(0) + i; -// // // int seed = 1726493105; -// // srand(seed); -// // const bool inplace = rand() % 2; -// // const int logn = rand() % 17 + 3; -// // // const int logn = rand() % 14 + 3; -// // // const int logn = 16; -// // const uint64_t N = 1 << logn; -// // const int log_ntt_domain_size = logn + 1; -// // const int log_batch_size = rand() % 3; -// // const int batch_size = 1 << log_batch_size; -// // const Ordering ordering = static_cast(rand() % 4); -// // bool columns_batch; -// // if (logn == 7 || logn < 4) { -// // columns_batch = false; // currently not supported (icicle_v3/backend/cuda/src/ntt/ntt.cuh line 578) -// // } else { -// // // columns_batch = true; -// // columns_batch = rand() % 2; -// // } -// // // const NTTDir dir = static_cast(rand() % 2); // 0: forward, 1: inverse -// // const NTTDir dir = static_cast(0); // 0: forward, 1: inverse -// // const int log_coset_stride = rand() % 3; -// // scalar_t coset_gen; -// // if (log_coset_stride) { -// // coset_gen = scalar_t::omega(logn + log_coset_stride); -// // } else { -// // coset_gen = scalar_t::one(); -// // } - -// const bool inplace = false; -// const int logn = 15; -// 
const uint64_t N = 1 << logn; -// const int log_ntt_domain_size = logn; -// const int log_batch_size = 0; -// const int batch_size = 1 << log_batch_size; -// const Ordering ordering = static_cast(0); -// bool columns_batch = false; -// const NTTDir dir = static_cast(0); // 0: forward, 1: inverse -// const int log_coset_stride = 0; -// scalar_t coset_gen; -// if (log_coset_stride) { -// coset_gen = scalar_t::omega(logn + log_coset_stride); -// } else { -// coset_gen = scalar_t::one(); -// } +#ifdef NTT -// // TODO SHANIE : remove -// // ICICLE_LOG_INFO << "NTT test: seed=" << seed; -// // ICICLE_LOG_INFO << "NTT test: omega=" << scalar_t::omega(logn); -// // ICICLE_LOG_INFO << "NTT test:s inplace=" << inplace; -// ICICLE_LOG_INFO << "NTT test: logn=" << logn; -// // ICICLE_LOG_INFO << "NTT test: log_ntt_domain_size=" << log_ntt_domain_size; -// // ICICLE_LOG_INFO << "NTT test: log_batch_size=" << log_batch_size; -// // ICICLE_LOG_INFO << "NTT test: columns_batch=" << columns_batch; -// // ICICLE_LOG_INFO << "NTT test: ordering=" << int(ordering); -// ICICLE_LOG_INFO << "NTT test: dir=" << (dir == NTTDir::kForward ? 
"forward" : "inverse"); -// ICICLE_LOG_INFO << "NTT test: log_coset_stride=" << log_coset_stride; -// ICICLE_LOG_INFO << "NTT test: coset_gen=" << coset_gen; - -// const int total_size = N * batch_size; -// auto scalars = std::make_unique(total_size); -// FieldApiTest::random_samples(scalars.get(), total_size); -// // for (int i = 0; i < total_size; i++) { scalars[i] = scalar_t::from(i); } //FIXME SHANIE: remove -// auto out_main = std::make_unique(total_size); -// auto out_ref = std::make_unique(total_size); -// auto run = [&](const std::string& dev_type, TypeParam* out, const char* msg, bool measure, int iters) { -// Device dev = {dev_type, 0}; -// icicle_set_device(dev); -// icicleStreamHandle stream = nullptr; -// ICICLE_CHECK(icicle_create_stream(&stream)); -// auto init_domain_config = default_ntt_init_domain_config(); -// init_domain_config.stream = stream; -// init_domain_config.is_async = false; -// ConfigExtension ext; -// ext.set(CudaBackendConfig::CUDA_NTT_FAST_TWIDDLES_MODE, true); -// init_domain_config.ext = &ext; -// auto config = default_ntt_config(); -// config.stream = stream; -// config.coset_gen = coset_gen; -// config.batch_size = batch_size; // default: 1 -// config.columns_batch = columns_batch; // default: false -// config.ordering = ordering; // default: kNN -// config.are_inputs_on_device = true; -// config.are_outputs_on_device = true; -// config.is_async = false; -// ICICLE_CHECK(ntt_init_domain(scalar_t::omega(log_ntt_domain_size), init_domain_config)); -// TypeParam *d_in, *d_out; -// ICICLE_CHECK(icicle_malloc_async((void**)&d_in, total_size * sizeof(TypeParam), config.stream)); -// ICICLE_CHECK(icicle_malloc_async((void**)&d_out, total_size * sizeof(TypeParam), config.stream)); -// ICICLE_CHECK(icicle_copy_to_device_async(d_in, scalars.get(), total_size * sizeof(TypeParam), config.stream)); -// std::ostringstream oss; -// oss << dev_type << " " << msg; -// START_TIMER(NTT_sync) -// for (int i = 0; i < iters; ++i) { -// if (inplace) 
{ -// ICICLE_CHECK(ntt(d_in, N, dir, config, d_in)); -// } else { -// ICICLE_CHECK(ntt(d_in, N, dir, config, d_out)); -// } -// } -// END_TIMER(NTT_sync, oss.str().c_str(), measure); +TYPED_TEST(FieldApiTest, ntt) +{ + // Randomize configuration -// if (inplace) { -// ICICLE_CHECK(icicle_copy_to_host_async(out, d_in, total_size * sizeof(TypeParam), config.stream)); -// } else { -// ICICLE_CHECK(icicle_copy_to_host_async(out, d_out, total_size * sizeof(TypeParam), config.stream)); -// } -// ICICLE_CHECK(icicle_free_async(d_in, config.stream)); -// ICICLE_CHECK(icicle_free_async(d_out, config.stream)); -// ICICLE_CHECK(icicle_stream_synchronize(config.stream)); -// ICICLE_CHECK(icicle_destroy_stream(stream)); -// ICICLE_CHECK(ntt_release_domain()); -// }; -// // run(s_main_target, out_main.get(), "ntt", false /*=measure*/, 0 /*=iters*/); // warmup -// run(s_reference_target, out_ref.get(), "V3ntt", VERBOSE /*=measure*/, 10 /*=iters*/); -// run(s_main_target, out_main.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); -// // std::cout << "left:\t["; for (int i = 0; i < total_size-1; i++) { std::cout << out_main[i] << ", "; } std::cout -// <()); + }; + run(s_main_target, out_main.get(), "ntt", false /*=measure*/, 10 /*=iters*/); // warmup + run(s_reference_target, out_ref.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); + run(s_main_target, out_main.get(), "ntt", VERBOSE /*=measure*/, 10 /*=iters*/); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); +} +#endif // NTT int main(int argc, char** argv) { From 5291608f40cecb25a97ab568386fd71a3c638d19 Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Sat, 2 Nov 2024 11:13:54 +0200 Subject: [PATCH 29/43] debug eval bug --- icicle/tests/test_field_api.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index e45a3ae0b..faf1d0d8d 100644 --- a/icicle/tests/test_field_api.cpp +++ 
b/icicle/tests/test_field_api.cpp @@ -774,10 +774,15 @@ TYPED_TEST(FieldApiTest, polynomialEval) int seed = time(0); srand(seed); ICICLE_LOG_DEBUG << "seed = " << seed; - const uint64_t coeffs_size = 1 << (rand() % 10 + 4); - const uint64_t domain_size = 1 << (rand() % 8 + 2); - const int batch_size = 1 << (rand() % 5); - const bool columns_batch = rand() % 2; + // const uint64_t coeffs_size = 1 << (rand() % 10 + 4); + // const uint64_t domain_size = 1 << (rand() % 8 + 2); + // const int batch_size = 1 << (rand() % 5); + // const bool columns_batch = rand() % 2; + + const uint64_t coeffs_size = 3; + const uint64_t domain_size = 4; + const int batch_size = 1; + const bool columns_batch = 0; ICICLE_LOG_DEBUG << "coeffs_size = " << coeffs_size; ICICLE_LOG_DEBUG << "domain_size = " << domain_size; From b7b26ecf24055e85df4ae20fc0ffc6387695f38c Mon Sep 17 00:00:00 2001 From: hadaringonyama Date: Sat, 2 Nov 2024 11:37:27 +0200 Subject: [PATCH 30/43] eval bug solved --- icicle/tests/test_field_api.cpp | 42 +++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index faf1d0d8d..ce54247a3 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -774,15 +774,18 @@ TYPED_TEST(FieldApiTest, polynomialEval) int seed = time(0); srand(seed); ICICLE_LOG_DEBUG << "seed = " << seed; - // const uint64_t coeffs_size = 1 << (rand() % 10 + 4); - // const uint64_t domain_size = 1 << (rand() % 8 + 2); - // const int batch_size = 1 << (rand() % 5); - // const bool columns_batch = rand() % 2; + const uint64_t coeffs_size = 1 << (rand() % 10 + 4); + const uint64_t domain_size = 1 << (rand() % 8 + 2); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + // const bool a_on_device = rand() % 2; + // const bool b_on_device = rand() % 2; + // const bool res_on_device = rand() % 2; - const uint64_t coeffs_size = 3; - const 
uint64_t domain_size = 4; - const int batch_size = 1; - const bool columns_batch = 0; + // const uint64_t coeffs_size = 3; + // const uint64_t domain_size = 4; + // const int batch_size = 1; + // const bool columns_batch = 0; ICICLE_LOG_DEBUG << "coeffs_size = " << coeffs_size; ICICLE_LOG_DEBUG << "domain_size = " << domain_size; @@ -790,18 +793,27 @@ TYPED_TEST(FieldApiTest, polynomialEval) ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; const int total_coeffs_size = coeffs_size * batch_size; + const int total_result_size = domain_size * batch_size; auto in_coeffs = std::make_unique(total_coeffs_size); auto in_domain = std::make_unique(domain_size); - auto out_main = std::make_unique(total_coeffs_size); - auto out_ref = std::make_unique(total_coeffs_size); + auto out_main = std::make_unique(total_result_size); + auto out_ref = std::make_unique(total_result_size); auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { Device dev = {dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); config.batch_size = batch_size; - config.columns_batch = columns_batch; + // config.is_a_on_device = a_on_device; + // config.is_b_on_device = b_on_device; + // config.is_result_on_device = res_on_device; + + // if (dev_type == "CUDA") { + // in_coeffs = config.is_a_on_device ? allocate_and_copy_to_device(in_coeffs, total_coeffs_size * sizeof(E), cuda_stream) : in_coeffs; + // in_domain = config.is_b_on_device ? allocate_and_copy_to_device(in_domain, domain_size * sizeof(E), cuda_stream) : in_domain; + // out = config.is_result_on_device ? 
allocate_and_copy_to_device(out, total_result_size * sizeof(E), cuda_stream) : out; + // } std::ostringstream oss; oss << dev_type << " " << msg; @@ -811,6 +823,12 @@ TYPED_TEST(FieldApiTest, polynomialEval) ICICLE_CHECK(polynomial_eval(in_coeffs.get(), coeffs_size, in_domain.get(), domain_size, config, out)); } END_TIMER(polynomialEval, oss.str().c_str(), measure); + + // if (dev_type == "CUDA") { + // if (config.is_a_on_device) cudaFree(in_coeffs); + // if (config.is_b_on_device) cudaFree(in_domain); + // if (config.is_result_on_device) cudaFree(out); + // } }; FieldApiTest::random_samples(in_coeffs.get(), total_coeffs_size); @@ -821,7 +839,7 @@ TYPED_TEST(FieldApiTest, polynomialEval) run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); if (s_is_cuda_registered) { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_coeffs_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_result_size * sizeof(TypeParam))); } } From baf3eb2e8585ae118b027aab299b323b81f7e932 Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Sun, 3 Nov 2024 17:46:03 +0200 Subject: [PATCH 31/43] removed vec-ops example - doesn't compile and very similar to other examples --- .../c++/vector-api/.devcontainer/Dockerfile | 25 --- .../.devcontainer/devcontainer.json | 22 --- examples/c++/vector-api/CMakeLists.txt | 16 -- examples/c++/vector-api/README.md | 28 ---- examples/c++/vector-api/example.cpp | 142 ------------------ examples/c++/vector-api/run.sh | 66 -------- 6 files changed, 299 deletions(-) delete mode 100644 examples/c++/vector-api/.devcontainer/Dockerfile delete mode 100644 examples/c++/vector-api/.devcontainer/devcontainer.json delete mode 100644 examples/c++/vector-api/CMakeLists.txt delete mode 100644 examples/c++/vector-api/README.md delete mode 100644 examples/c++/vector-api/example.cpp delete mode 100755 examples/c++/vector-api/run.sh 
diff --git a/examples/c++/vector-api/.devcontainer/Dockerfile b/examples/c++/vector-api/.devcontainer/Dockerfile deleted file mode 100644 index 64188da96..000000000 --- a/examples/c++/vector-api/.devcontainer/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -# Make sure NVIDIA Container Toolkit is installed on your host - -# Use the specified base image -FROM nvidia/cuda:12.0.0-devel-ubuntu22.04 - -# Update and install dependencies -RUN apt-get update && apt-get install -y \ - cmake \ - curl \ - build-essential \ - git \ - libboost-all-dev \ - && rm -rf /var/lib/apt/lists/* - -# Clone Icicle from a GitHub repository -RUN git clone https://github.com/ingonyama-zk/icicle.git /icicle - -# Set the working directory in the container -WORKDIR /icicle-example - -# Specify the default command for the container -CMD ["/bin/bash"] - - - diff --git a/examples/c++/vector-api/.devcontainer/devcontainer.json b/examples/c++/vector-api/.devcontainer/devcontainer.json deleted file mode 100644 index 490fe90a6..000000000 --- a/examples/c++/vector-api/.devcontainer/devcontainer.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "Icicle Examples: polynomial multiplication", - "build": { - "dockerfile": "Dockerfile" - }, - "runArgs": [ - "--gpus", - "all" - ], - "postCreateCommand": [ - "nvidia-smi" - ], - "customizations": { - "vscode": { - "extensions": [ - "ms-vscode.cmake-tools", - "ms-python.python", - "ms-vscode.cpptools" - ] - } - } -} \ No newline at end of file diff --git a/examples/c++/vector-api/CMakeLists.txt b/examples/c++/vector-api/CMakeLists.txt deleted file mode 100644 index c32f17f43..000000000 --- a/examples/c++/vector-api/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -cmake_minimum_required(VERSION 3.18) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED TRUE) - -project(example) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") -add_executable(example example.cpp) -target_include_directories(example PRIVATE "../../../icicle/include" "..") 
-target_link_directories(example PRIVATE "${CMAKE_SOURCE_DIR}/build/icicle") -message("${CMAKE_BINARY_DIR}/icicle") -target_link_libraries(example PRIVATE icicle_curve_bn254 icicle_field_bn254 icicle_device) -if(BACKEND_DIR) - add_compile_definitions(BACKEND_DIR="${BACKEND_DIR}") -endif() - diff --git a/examples/c++/vector-api/README.md b/examples/c++/vector-api/README.md deleted file mode 100644 index 120156c9f..000000000 --- a/examples/c++/vector-api/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# Icicle Example: Vector Operations API - -## Key-Takeaway - -The Vector Operations API supports the following: - - - element-wise vector operations (e.g. addition, multiplication) - - vector reduction operations (e.g. sum of elements, product of elements) - - scalar-vector operations (e.g add scalar to vector) - - matrix operations (e.g. transposition) - - miscellaneous operations like bit-reversal and slicing. - - All these operations can be performed on a host or device both synchronously and asynchronously. - -## Running the example - -```sh -# for CPU -./run.sh -d CPU -# for CUDA -./run.sh -d CUDA -b /path/to/cuda/backend/install/dir -``` - -## What's in the example - -1. `example_element_wise`: examples of element-wise operations -2. `example_scalar_vector`: examples of scalar-vector operations - diff --git a/examples/c++/vector-api/example.cpp b/examples/c++/vector-api/example.cpp deleted file mode 100644 index 5c4497a64..000000000 --- a/examples/c++/vector-api/example.cpp +++ /dev/null @@ -1,142 +0,0 @@ -#include -#include -#include - -#include "icicle/runtime.h" -#include "icicle/api/bn254.h" -#include "icicle/utils/log.h" - - -// SP: I understand this code is auto-generated, but I can't get script/gen to work. 
- -extern "C" eIcicleError bn254_vector_product( - const bn254::scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result, uint64_t offset, uint64_t stride); - -extern "C" eIcicleError bn254_vector_sum( - const bn254::scalar_t* vec_a, uint64_t n, const VecOpsConfig* config, bn254::scalar_t* result, uint64_t offset, uint64_t stride); - -// SP: end of my changes - -using namespace bn254; - -#include "examples_utils.h" - -void random_samples(scalar_t* res, uint32_t count) -{ - for (int i = 0; i < count; i++) - res[i] = i < 1000 ? scalar_t::rand_host() : res[i - 1000]; -} - -void incremental_values(scalar_t* res, uint32_t count) -{ - for (int i = 0; i < count; i++) { - res[i] = i ? res[i - 1] + scalar_t::one() : scalar_t::one(); - } -} - - -void example_element_wise() { - return; -} - -int main(int argc, char** argv) -{ - // try_load_and_set_backend_device(argc, argv); - - int N_LOG = 20; - int N = 1 << N_LOG; - int offset = 1; - int stride = 4; - - // on-host data - auto h_a = std::make_unique(N); - auto h_b = std::make_unique(N); - auto h_out = std::make_unique(N); - - random_samples(h_a.get(), N ); - random_samples(h_b.get(), N ); - - // incremental_values(h_a.get(), N ); - // incremental_values(h_b.get(), N ); - - // on-device data - scalar_t *d_a, *d_b, *d_out; - - DeviceProperties device_props; - ICICLE_CHECK(icicle_get_device_properties(device_props)); - if (!device_props.using_host_memory) { - std::cout << "Device isn't using host memory" << std::endl; - } else { - std::cout << "Device is using host memory" << std::endl; - } - - ICICLE_CHECK(icicle_malloc((void**)&d_a, sizeof(scalar_t) * N)); - ICICLE_CHECK(icicle_malloc((void**)&d_b, sizeof(scalar_t) * N)); - ICICLE_CHECK(icicle_malloc((void**)&d_out, sizeof(scalar_t) * N)); - - ICICLE_CHECK(icicle_copy(d_a, h_a.get(), sizeof(scalar_t) * N)); - ICICLE_CHECK(icicle_copy(d_b, h_b.get(), sizeof(scalar_t) * N)); - - VecOpsConfig h_config{ - nullptr, - false, // is_a_on_device - false, // 
is_b_on_device - false, // is_result_on_device - false, // is_async - nullptr // ext - }; - - VecOpsConfig d_config{ - nullptr, - true, // is_a_on_device - true, // is_b_on_device - true, // is_result_on_device - false, // is_async - nullptr // ext - }; - - - // Reduction operations - - START_TIMER(baseline_reduce_sum); - h_out[0] = scalar_t::zero(); - for (uint64_t i = offset; i < N; i=i+stride) { - h_out[0] = h_out[0] + h_a[i]; - } - END_TIMER(baseline_reduce_sum, "baseline reduce sum took"); - - START_TIMER(reduce_sum); - ICICLE_CHECK(bn254_vector_sum(d_a, N, &h_config, d_out, offset, stride)); - END_TIMER(reduce_sum, "reduce sum took"); - - - std::cout << "h_out: " << h_out[0] << std::endl; - std::cout << "d_out: " << d_out[0] << std::endl; - - - START_TIMER(baseline_reduce_product); - h_out[0] = scalar_t::one(); - for (uint64_t i = offset; i < N; i = i + stride) { - h_out[0] = h_out[0] * h_a[i]; - } - END_TIMER(baseline_reduce_product, "baseline reduce product took"); - - - START_TIMER(reduce_product); - ICICLE_CHECK(bn254_vector_product(d_a, N, &d_config, d_out, offset, stride)); - END_TIMER(reduce_product, "reduce product took"); - - - std::cout << "h_out: " << h_out[0] << std::endl; - std::cout << "d_out: " << d_out[0] << std::endl; - - - - - - ICICLE_CHECK(icicle_free(d_a)); - ICICLE_CHECK(icicle_free(d_b)); - ICICLE_CHECK(icicle_free(d_out)); - - return 0; -} \ No newline at end of file diff --git a/examples/c++/vector-api/run.sh b/examples/c++/vector-api/run.sh deleted file mode 100755 index 879390d0a..000000000 --- a/examples/c++/vector-api/run.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -# Exit immediately if a command exits with a non-zero status -set -e - -# Function to display usage information -show_help() { - echo "Usage: $0 [-d DEVICE_TYPE] [-b ICICLE_BACKEND_INSTALL_DIR]" - echo - echo "Options:" - echo " -d DEVICE_TYPE Specify the device type (default: CPU)" - echo " -b ICICLE_BACKEND_INSTALL_DIR Specify the backend installation directory 
(default: empty)" - echo " -h Show this help message" - exit 0 -} - -# Parse command line options -while getopts ":d:b:h" opt; do - case ${opt} in - d ) - DEVICE_TYPE=$OPTARG - ;; - b ) - ICICLE_BACKEND_INSTALL_DIR="$(realpath ${OPTARG})" - ;; - h ) - show_help - ;; - \? ) - echo "Invalid option: -$OPTARG" 1>&2 - show_help - ;; - : ) - echo "Invalid option: -$OPTARG requires an argument" 1>&2 - show_help - ;; - esac -done - -# Set default values if not provided -: "${DEVICE_TYPE:=CPU}" -: "${ICICLE_BACKEND_INSTALL_DIR:=}" - -# Create necessary directories -mkdir -p build/example -mkdir -p build/icicle - -ICILE_DIR=$(realpath "../../../icicle/") -ICICLE_CUDA_SOURCE_DIR="${ICILE_DIR}/backend/cuda" - -# Build Icicle and the example app that links to it -if [ "$DEVICE_TYPE" == "CUDA" ] && [ ! -d "${ICICLE_BACKEND_INSTALL_DIR}" ] && [ -d "${ICICLE_CUDA_SOURCE_DIR}" ]; then - echo "Building icicle with CUDA backend" - cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -DMSM=OFF -DG2=OFF -DECNTT=OFF -DCUDA_BACKEND=local -S "${ICILE_DIR}" -B build/icicle - export ICICLE_BACKEND_INSTALL_DIR=$(realpath "build/icicle/backend") -else - echo "Building icicle without CUDA backend, ICICLE_BACKEND_INSTALL_DIR=${ICICLE_BACKEND_INSTALL_DIR}" - export ICICLE_BACKEND_INSTALL_DIR="${ICICLE_BACKEND_INSTALL_DIR}" - cmake -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -S "${ICILE_DIR}" -B build/icicle -fi -cmake -DCMAKE_BUILD_TYPE=Release -S . 
-B build/example - -cmake --build build/icicle -j -cmake --build build/example -j - -./build/example/example "$DEVICE_TYPE" From 2ed43696dbe970d35de9638eb264be18f32d23f8 Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Sun, 3 Nov 2024 19:55:50 +0200 Subject: [PATCH 32/43] updated poly-div test and poly-eval fix for column mode --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 6 +- icicle/include/icicle/vec_ops.h | 8 +- icicle/tests/test_field_api.cpp | 230 ++++++------------- 3 files changed, 73 insertions(+), 171 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 24e53fa59..0e3e7c2d5 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -902,11 +902,7 @@ eIcicleError cpu_poly_divide( ICICLE_ASSERT(q_size >= (numerator_deg - denumerator_deg + 1)) << "polynomial division expects q(x) size to be at least deg(numerator)-deg(denumerator)+1"; - // ICICLE_CHECK(icicle_copy_async(r_out, numerator, r_size * config.batch_size * sizeof(T), config.stream)); - // copy numerator to r_out // FIXME should it be copied using icicle_copy_async? - for (uint64_t i = 0; i < (numerator_deg + 1) * config.batch_size; ++i) { - r_out[i] = numerator[i]; - } + memcpy(r_out, numerator, sizeof(T) * numerator_size * config.batch_size); uint32_t stride = config.columns_batch ? config.batch_size : 1; auto deg_r = std::make_unique(config.batch_size); diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index 2868aa682..cc317783b 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -397,11 +397,11 @@ namespace icicle { * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored * contiguously. * - If `config.columns_batch` is `true`, coefficients are interleaved. - * @param numerator_deg Degree of the numerator polynomial. 
+ * @param numerator_max_deg Maximal degree of the numerator polynomials in the batch. * @param numerator_size size (number of T elements) in numerator vec * @param denominator Pointer to the array of coefficients of the denominator polynomial(s). * - Storage layout is similar to `numerator`. - * @param denominator_deg Degree of the denominator polynomial. + * @param denumerator_max_deg Maximal degree of the denominator polynomials in the batch. * @param denominator_size size (number of T elements) in denumerator vec * @param config Configuration for the operation. * @param q_size Size of the quotient array for one polynomial. @@ -420,10 +420,10 @@ namespace icicle { template eIcicleError polynomial_division( const T* numerator, - int64_t numerator_deg, + int64_t numerator_max_deg, uint64_t numerator_size, const T* denumerator, - int64_t denumerator_deg, + int64_t denumerator_max_deg, uint64_t denumerator_size, uint64_t q_size, uint64_t r_size, diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index ce54247a3..62c3a75bf 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -17,7 +17,7 @@ using namespace field_config; using namespace icicle; -// TODO - add tests that test different configurations of data on device or on host. +// TODO Hadar - add tests that test different configurations of data on device or on host. 
using FpMicroseconds = std::chrono::duration; #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); @@ -30,7 +30,8 @@ static bool VERBOSE = true; static int ITERS = 1; static inline std::string s_main_target; static inline std::string s_reference_target; -bool s_is_cuda_registered; +static inline std::vector s_registered_devices; +bool s_is_cuda_registered; // TODO Yuval remove this template class FieldApiTest : public ::testing::Test @@ -48,6 +49,7 @@ class FieldApiTest : public ::testing::Test if (!s_is_cuda_registered) { ICICLE_LOG_ERROR << "CUDA device not found. Testing CPU vs reference (on cpu)"; } s_main_target = s_is_cuda_registered ? "CUDA" : "CPU"; s_reference_target = "CPU"; + s_registered_devices = get_registered_devices_list(); } static void TearDownTestSuite() { @@ -436,9 +438,11 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) srand(seed); ICICLE_LOG_DEBUG << "seed = " << seed; const int R = - 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 + 1 + << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 const int C = - 1 << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 + 1 + << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 const int batch_size = 1 << (rand() % 4); const bool columns_batch = rand() % 2; const bool is_in_place = @@ -727,11 +731,12 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) const bool columns_batch = rand() % 2; const int total_size = N * batch_size; - ICICLE_LOG_DEBUG << "N = " << N; - ICICLE_LOG_DEBUG << "batch_size = " << batch_size; - ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; - auto in_a = std::make_unique(total_size); + for (int i = 0; i < batch_size; ++i) { + // randomize different rows with zeros in the end + auto size = 
std::max(int64_t(N) / 4 - i, int64_t(1)); + scalar_t::rand_host_many(in_a.get() + i * N, size); + } auto out_main = std::make_unique(batch_size); auto out_ref = std::make_unique(batch_size); @@ -752,20 +757,8 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) END_TIMER(highestNonZeroIdx, oss.str().c_str(), measure); }; - // Initialize each entire vector with 1 at a random index. The highest non-zero index is the index with 1 - for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { - out_ref[idx_in_batch] = static_cast(rand() % N); // highest_non_zero_idx - for (uint32_t i = 0; i < N; i++) { - if (columns_batch) { - in_a[idx_in_batch + batch_size * i] = TypeParam::from(i == out_ref[idx_in_batch] ? 1 : 0); - } else { - in_a[idx_in_batch * N + i] = TypeParam::from(i == out_ref[idx_in_batch] ? 1 : 0); - } - } - } - if (s_is_cuda_registered) { run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "highest_non_zero_idx", 1); } + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "highest_non_zero_idx", 1); run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "highest_non_zero_idx", 1); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(int64_t))); } @@ -778,14 +771,6 @@ TYPED_TEST(FieldApiTest, polynomialEval) const uint64_t domain_size = 1 << (rand() % 8 + 2); const int batch_size = 1 << (rand() % 5); const bool columns_batch = rand() % 2; - // const bool a_on_device = rand() % 2; - // const bool b_on_device = rand() % 2; - // const bool res_on_device = rand() % 2; - - // const uint64_t coeffs_size = 3; - // const uint64_t domain_size = 4; - // const int batch_size = 1; - // const bool columns_batch = 0; ICICLE_LOG_DEBUG << "coeffs_size = " << coeffs_size; ICICLE_LOG_DEBUG << "domain_size = " << domain_size; @@ -805,15 +790,7 @@ TYPED_TEST(FieldApiTest, polynomialEval) icicle_set_device(dev); auto config = default_vec_ops_config(); config.batch_size = batch_size; - // config.is_a_on_device = a_on_device; - // 
config.is_b_on_device = b_on_device; - // config.is_result_on_device = res_on_device; - - // if (dev_type == "CUDA") { - // in_coeffs = config.is_a_on_device ? allocate_and_copy_to_device(in_coeffs, total_coeffs_size * sizeof(E), cuda_stream) : in_coeffs; - // in_domain = config.is_b_on_device ? allocate_and_copy_to_device(in_domain, domain_size * sizeof(E), cuda_stream) : in_domain; - // out = config.is_result_on_device ? allocate_and_copy_to_device(out, total_result_size * sizeof(E), cuda_stream) : out; - // } + config.columns_batch = columns_batch; std::ostringstream oss; oss << dev_type << " " << msg; @@ -823,138 +800,67 @@ TYPED_TEST(FieldApiTest, polynomialEval) ICICLE_CHECK(polynomial_eval(in_coeffs.get(), coeffs_size, in_domain.get(), domain_size, config, out)); } END_TIMER(polynomialEval, oss.str().c_str(), measure); - - // if (dev_type == "CUDA") { - // if (config.is_a_on_device) cudaFree(in_coeffs); - // if (config.is_b_on_device) cudaFree(in_domain); - // if (config.is_result_on_device) cudaFree(out); - // } }; FieldApiTest::random_samples(in_coeffs.get(), total_coeffs_size); FieldApiTest::random_samples(in_domain.get(), domain_size); - // Reference implementation - TODO - run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); - if (s_is_cuda_registered) { - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_result_size * sizeof(TypeParam))); - } + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_result_size * sizeof(TypeParam))); } -// TYPED_TEST(FieldApiTest, polynomialDivision) -// { -// int seed = time(0); -// srand(seed); -// ICICLE_LOG_DEBUG << "seed = " << seed; -// // const int64_t numerator_deg = 1 << 4; -// // const int64_t denumerator_deg = 1 << 2; -// // const uint64_t q_size = numerator_deg - denumerator_deg + 1; -// // const 
uint64_t r_size = numerator_deg + 1; -// const int64_t numerator_deg = 3; -// const int64_t denumerator_deg = 2; -// const uint64_t q_size = 2; -// const uint64_t r_size = 4; -// // const int batch_size = 1 << (rand() % 5); -// const int batch_size = 1; -// const bool columns_batch = rand() % 2; - -// const int64_t total_numerator_size = (numerator_deg + 1) * batch_size; -// const int64_t total_denumerator_size = (denumerator_deg + 1) * batch_size; -// const uint64_t total_q_size = q_size * batch_size; -// const uint64_t total_r_size = r_size * batch_size; - -// auto numerator = std::make_unique(total_numerator_size); -// auto denumerator = std::make_unique(total_denumerator_size); -// auto q_out_main = std::make_unique(total_q_size); -// auto r_out_main = std::make_unique(total_r_size); -// auto q_out_ref = std::make_unique(total_q_size); -// auto r_out_ref = std::make_unique(total_r_size); - -// auto run = -// [&](const std::string& dev_type, TypeParam* q_out, TypeParam* r_out, bool measure, const char* msg, int iters) { -// Device dev = {dev_type, 0}; -// icicle_set_device(dev); -// auto config = default_vec_ops_config(); -// config.batch_size = batch_size; -// config.columns_batch = columns_batch; - -// std::ostringstream oss; -// oss << dev_type << " " << msg; - -// START_TIMER(polynomialDivision) -// for (int i = 0; i < iters; ++i) { -// ICICLE_CHECK(polynomial_division( -// numerator.get(), numerator_deg, total_numerator_size, denumerator.get(), denumerator_deg, -// total_denumerator_size, q_size, r_size, config, q_out, r_out)); -// } -// END_TIMER(polynomialDivision, oss.str().c_str(), measure); -// }; - -// // // Option 1: Initialize input vectors with random values -// // FieldApiTest::random_samples(numerator.get(), total_numerator_size); -// // FieldApiTest::random_samples(denumerator.get(), total_denumerator_size); -// // // Reference implementation -// // TODO - Check in comperison with GPU implementation or implement a general reference 
implementation - -// // Option 2: Initialize the numerator and denumerator with chosen example -// // And the reference implementation for the example - -// for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { -// if (columns_batch) { -// // numerator = 3x^3+4x^2+5 -// numerator[idx_in_batch + 0 * batch_size] = TypeParam::from(5); -// numerator[idx_in_batch + 1 * batch_size] = TypeParam::from(0); -// numerator[idx_in_batch + 2 * batch_size] = TypeParam::from(4); -// numerator[idx_in_batch + 3 * batch_size] = TypeParam::from(3); -// // denumerator = x^2-1 -// denumerator[idx_in_batch + 0 * batch_size] = TypeParam::from(0) - TypeParam::from(1); -// denumerator[idx_in_batch + 1 * batch_size] = TypeParam::from(0); -// denumerator[idx_in_batch + 2 * batch_size] = TypeParam::from(1); -// if (!s_is_cuda_registered) { -// // q_out_ref = 3x+4 -// q_out_ref[idx_in_batch + 0 * batch_size] = TypeParam::from(4); -// q_out_ref[idx_in_batch + 1 * batch_size] = TypeParam::from(3); -// // r_out_ref = 3x+9 -// r_out_ref[idx_in_batch + 0 * batch_size] = TypeParam::from(9); -// r_out_ref[idx_in_batch + 1 * batch_size] = TypeParam::from(3); -// } -// } else { -// // numerator = 3x^3+4x^2+5 -// numerator[idx_in_batch * (numerator_deg + 1) + 0] = TypeParam::from(5); -// numerator[idx_in_batch * (numerator_deg + 1) + 1] = TypeParam::from(0); -// numerator[idx_in_batch * (numerator_deg + 1) + 2] = TypeParam::from(4); -// numerator[idx_in_batch * (numerator_deg + 1) + 3] = TypeParam::from(3); -// // denumerator = x^2-1 -// denumerator[idx_in_batch * (denumerator_deg + 1) + 0] = TypeParam::from(0) - TypeParam::from(1); -// denumerator[idx_in_batch * (denumerator_deg + 1) + 1] = TypeParam::from(0); -// denumerator[idx_in_batch * (denumerator_deg + 1) + 2] = TypeParam::from(1); -// if (!s_is_cuda_registered) { -// // q_out_ref = 3x+4 -// q_out_ref[idx_in_batch * q_size + 0] = TypeParam::from(4); -// q_out_ref[idx_in_batch * q_size + 1] = TypeParam::from(3); -// // 
r_out_ref = 3x+9 -// r_out_ref[idx_in_batch * r_size + 0] = TypeParam::from(9); -// r_out_ref[idx_in_batch * r_size + 1] = TypeParam::from(3); -// } -// } -// } - -// if (s_is_cuda_registered) { -// run(s_reference_target, q_out_ref.get(), r_out_ref.get(), VERBOSE /*=measure*/, "polynomial_division", 1); -// } -// std::cout << "numerator:\t["; for (int i = 0; i < total_numerator_size-1; i++) { std::cout << numerator[i] << ", "; -// } std::cout <(numerator_size * batch_size); + auto denumerator = std::make_unique(denumerator_size * batch_size); + TypeParam::rand_host_many(numerator.get(), numerator_size * batch_size); + TypeParam::rand_host_many(denumerator.get(), denumerator_size * batch_size); + + for (auto device : s_registered_devices) { + ICICLE_CHECK(icicle_set_device(device)); + for (int columns_batch = 0; columns_batch <= 1; columns_batch++) { + ICICLE_LOG_DEBUG << "testing polynomial division on device " << device << " [column_batch=" << columns_batch + << "]"; + auto q = std::make_unique(q_size * batch_size); + auto r = std::make_unique(r_size * batch_size); + + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + ICICLE_CHECK(polynomial_division( + numerator.get(), max_num_deg, numerator_size, denumerator.get(), max_denum_deg, denumerator_size, q_size, + r_size, config, q.get(), r.get())); + + // test a(x)=q(x)b(x)+r(x) in random point + const auto rand_x = TypeParam::rand_host(); + auto ax = std::make_unique(batch_size); + auto bx = std::make_unique(batch_size); + auto qx = std::make_unique(batch_size); + auto rx = std::make_unique(batch_size); + polynomial_eval(numerator.get(), numerator_size, &rand_x, 1, config, ax.get()); + polynomial_eval(denumerator.get(), denumerator_size, &rand_x, 1, config, bx.get()); + polynomial_eval(q.get(), q_size, &rand_x, 1, config, qx.get()); + polynomial_eval(r.get(), r_size, &rand_x, 1, config, rx.get()); + + for (int i = 0; i < batch_size; ++i) { + // 
ICICLE_LOG_DEBUG << "ax=" << ax[i] << ", bx=" << bx[i] << ", qx=" << qx[i] << ", rx=" << rx[i]; + ASSERT_EQ(ax[i], qx[i] * bx[i] + rx[i]); + } + } + } +} #ifdef NTT From b7d62c8910c13bc9085de5514a4e2f78e8b07c8a Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 17:35:26 +0200 Subject: [PATCH 33/43] updated for poly div --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 59 ++++++++++--------- .../include/icicle/backend/vec_ops_backend.h | 12 ++-- .../default_backend/default_poly_backend.h | 2 +- icicle/include/icicle/vec_ops.h | 30 +++------- icicle/src/vec_ops.cpp | 50 ++++------------ icicle/tests/test_field_api.cpp | 45 +++++++++----- 6 files changed, 84 insertions(+), 114 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 0e3e7c2d5..b22c1ade2 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -886,43 +886,48 @@ template eIcicleError cpu_poly_divide( const Device& device, const T* numerator, - int64_t numerator_deg, uint64_t numerator_size, - const T* denumerator, - int64_t denumerator_deg, - uint64_t denumerator_size, - uint64_t q_size, - uint64_t r_size, + const T* denominator, + uint64_t denominator_size, const VecOpsConfig& config, T* q_out /*OUT*/, - T* r_out /*OUT*/) + uint64_t q_size, + T* r_out /*OUT*/, + uint64_t r_size) { - ICICLE_ASSERT(r_size >= numerator_deg) - << "polynomial division expects r(x) size to be similar to numerator size and higher than numerator degree(x)"; - ICICLE_ASSERT(q_size >= (numerator_deg - denumerator_deg + 1)) - << "polynomial division expects q(x) size to be at least deg(numerator)-deg(denumerator)+1"; - - memcpy(r_out, numerator, sizeof(T) * numerator_size * config.batch_size); + if (config.batch_size != 1 && config.columns_batch) { + ICICLE_LOG_ERROR << "polynomial division is not implemented for column batch. 
Planned for v3.2"; + return eIcicleError::API_NOT_IMPLEMENTED; + } uint32_t stride = config.columns_batch ? config.batch_size : 1; - auto deg_r = std::make_unique(config.batch_size); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) { - const T* curr_denumerator = config.columns_batch - ? denumerator + idx_in_batch - : denumerator + idx_in_batch * (denumerator_deg + 1); // Pointer to the current vector - T* curr_q_out = - config.columns_batch ? q_out + idx_in_batch : q_out + idx_in_batch * q_size; // Pointer to the current vector - T* curr_r_out = - config.columns_batch ? r_out + idx_in_batch : r_out + idx_in_batch * r_size; // Pointer to the current vector + const T* curr_numerator = + config.columns_batch ? numerator + idx_in_batch : numerator + idx_in_batch * numerator_size; + const T* curr_denominator = + config.columns_batch ? denominator + idx_in_batch : denominator + idx_in_batch * denominator_size; + T* curr_q_out = config.columns_batch ? q_out + idx_in_batch : q_out + idx_in_batch * q_size; + T* curr_r_out = config.columns_batch ? 
r_out + idx_in_batch : r_out + idx_in_batch * r_size; + + int64_t numerator_deg, denominator_deg; + cpu_highest_non_zero_idx(device, curr_numerator, numerator_size, default_vec_ops_config(), &numerator_deg); + cpu_highest_non_zero_idx(device, curr_denominator, denominator_size, default_vec_ops_config(), &denominator_deg); + ICICLE_ASSERT(r_size >= numerator_deg + 1) + << "polynomial division expects r(x) size to be similar to numerator size and higher than numerator degree(x)"; + ICICLE_ASSERT(q_size >= (numerator_deg - denominator_deg + 1)) + << "polynomial division expects q(x) size to be at least deg(numerator)-deg(denominator)+1"; + + memset(curr_r_out, 0, sizeof(T) * r_size); + memcpy(curr_r_out, curr_numerator, sizeof(T) * (numerator_deg + 1)); + // invert largest coeff of b - const T& lc_b_inv = T::inverse(curr_denumerator[denumerator_deg * stride]); - deg_r[idx_in_batch] = numerator_deg; - while (deg_r[idx_in_batch] >= denumerator_deg) { + const T& lc_b_inv = T::inverse(curr_denominator[denominator_deg * stride]); + int64_t deg_r = numerator_deg; + while (deg_r >= denominator_deg) { // each iteration is removing the largest monomial in r until deg(r); + uint64_t q_size, + scalar_t* r_out /*OUT*/, + uint64_t r_size)>; void register_vector_add(const std::string& deviceType, vectorVectorOpImpl impl); diff --git a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h index a42c87317..bfa57f9c3 100644 --- a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h +++ b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h @@ -278,7 +278,7 @@ namespace icicle { config.is_result_on_device = true; ICICLE_CHECK(icicle::polynomial_division( - a_coeffs, deg_a, a_N, b_coeffs, deg_b, b_N, deg_a - deg_b + 1, a_N, config, Q_coeffs, R_coeffs)); + a_coeffs, deg_a + 1, b_coeffs, deg_b + 1, config, Q_coeffs, deg_a - deg_b + 1, R_coeffs, a_N)); } 
void quotient(PolyContext Q, PolyContext op_a, PolyContext op_b) override diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index cc317783b..38551ab6a 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -397,47 +397,31 @@ namespace icicle { * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored * contiguously. * - If `config.columns_batch` is `true`, coefficients are interleaved. - * @param numerator_max_deg Maximal degree of the numerator polynomials in the batch. - * @param numerator_size size (number of T elements) in numerator vec + * @param numerator_size size (number of T elements) in numerator vec of a single batch element * @param denominator Pointer to the array of coefficients of the denominator polynomial(s). * - Storage layout is similar to `numerator`. - * @param denumerator_max_deg Maximal degree of the denominator polynomials in the batch. - * @param denominator_size size (number of T elements) in denumerator vec + * @param denominator_size size (number of T elements) in denominator vec of a single batch element * @param config Configuration for the operation. - * @param q_size Size of the quotient array for one polynomial. - * @param r_size Size of the remainder array. * @param q_out Pointer to the array where the quotient polynomial(s) will be stored. This is an output parameter. * - The storage layout should match that of `numerator`. + * @param q_size Size of the quotient array for one polynomial. * @param r_out Pointer to the array where the remainder polynomial(s) will be stored. This is an output parameter. * - The storage layout should match that of `numerator`. * - The size of `r_out` should be sufficient to hold the remainder coefficients for each polynomial. + * @param r_size Size of the remainder array. * @return eIcicleError Error code indicating success or failure. 
* * @note The degrees should satisfy `numerator_deg >= denominator_deg`. * The sizes `q_size` and `r_size` must be at least `numerator_deg - denominator_deg + 1` and `denominator_deg`, * respectively. The function assumes that the input and output arrays are properly allocated. */ - template - eIcicleError polynomial_division( - const T* numerator, - int64_t numerator_max_deg, - uint64_t numerator_size, - const T* denumerator, - int64_t denumerator_max_deg, - uint64_t denumerator_size, - uint64_t q_size, - uint64_t r_size, - const VecOpsConfig& config, - T* q_out /*OUT*/, - T* r_out /*OUT*/); - // deprecated API template eIcicleError polynomial_division( const T* numerator, - int64_t numerator_deg, - const T* denumerator, - int64_t denumerator_deg, + uint64_t numerator_size, + const T* denominator, + uint64_t denominator_size, const VecOpsConfig& config, T* q_out /*OUT*/, uint64_t q_size, diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index c8b867470..ebb86e0c1 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -419,63 +419,33 @@ namespace icicle { extern "C" eIcicleError CONCAT_EXPAND(FIELD, poly_division)( const scalar_t* numerator, - int64_t numerator_deg, uint64_t numerator_size, - const scalar_t* denumerator, - int64_t denumerator_deg, - uint64_t denumerator_size, - uint64_t q_size, - uint64_t r_size, + const scalar_t* denominator, + int64_t denominator_size, const VecOpsConfig& config, scalar_t* q_out /*OUT*/, - scalar_t* r_out /*OUT*/) + uint64_t q_size, + scalar_t* r_out /*OUT*/, + uint64_t r_size) { return ScalarPolyDivDispatcher::execute( - numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, config, - q_out, r_out); + numerator, numerator_size, denominator, denominator_size, config, q_out, q_size, r_out, r_size); } template <> eIcicleError polynomial_division( const scalar_t* numerator, - int64_t numerator_deg, uint64_t numerator_size, - const scalar_t* denumerator, - 
int64_t denumerator_deg, - uint64_t denumerator_size, - uint64_t q_size, - uint64_t r_size, - const VecOpsConfig& config, - scalar_t* q_out /*OUT*/, - scalar_t* r_out /*OUT*/) - { - return CONCAT_EXPAND(FIELD, poly_division)( - numerator, numerator_deg, numerator_size, denumerator, denumerator_deg, denumerator_size, q_size, r_size, config, - q_out, r_out); - } - - // Deprecated API - template <> - eIcicleError polynomial_division( - const scalar_t* numerator, - int64_t numerator_deg, - const scalar_t* denumerator, - int64_t denumerator_deg, + const scalar_t* denominator, + uint64_t denominator_size, const VecOpsConfig& config, scalar_t* q_out /*OUT*/, uint64_t q_size, scalar_t* r_out /*OUT*/, uint64_t r_size) { - ICICLE_LOG_WARNING - << "polynomial_division api is deprecated and replace with new api. Use new polynomial_division api instead"; - if (config.batch_size != 1) { - ICICLE_LOG_ERROR << "deprecated polynomial_division API does not support batch"; - return eIcicleError::INVALID_ARGUMENT; - } - return polynomial_division( - numerator, numerator_deg, numerator_deg + 1, denumerator, denumerator_deg, denumerator_deg + 1, q_size, r_size, - config, q_out, r_out); + return CONCAT_EXPAND(FIELD, poly_division)( + numerator, numerator_size, denominator, denominator_size, config, q_out, q_size, r_out, r_size); } } // namespace icicle \ No newline at end of file diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 62c3a75bf..6ae5a414c 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -813,20 +813,27 @@ TYPED_TEST(FieldApiTest, polynomialEval) TYPED_TEST(FieldApiTest, polynomialDivision) { const uint64_t numerator_size = 1 << 4; - const uint64_t denumerator_size = 1 << 3; - const int64_t max_num_deg = numerator_size - 1; - const int64_t max_denum_deg = denumerator_size - 1; - const uint64_t q_size = max_num_deg - max_denum_deg + 1; - const uint64_t r_size = max_num_deg + 1; + const uint64_t 
denominator_size = 1 << 3; + const uint64_t q_size = numerator_size - denominator_size + 1; + const uint64_t r_size = numerator_size; const int batch_size = 10 + rand() % 10; // basically we compute q(x),r(x) for a(x)=q(x)b(x)+r(x) by dividing a(x)/b(x) // randomize matrix with rows/cols as polynomials auto numerator = std::make_unique(numerator_size * batch_size); - auto denumerator = std::make_unique(denumerator_size * batch_size); + auto denominator = std::make_unique(denominator_size * batch_size); TypeParam::rand_host_many(numerator.get(), numerator_size * batch_size); - TypeParam::rand_host_many(denumerator.get(), denumerator_size * batch_size); + TypeParam::rand_host_many(denominator.get(), denominator_size * batch_size); + + // Add padding to each row so that the degree is lower than the size + const int zero_pad_length = 5; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < zero_pad_length; ++j) { + numerator[i * numerator_size + numerator_size - zero_pad_length + j] = TypeParam::zero(); + denominator[i * denominator_size + denominator_size - zero_pad_length + j] = TypeParam::zero(); + } + } for (auto device : s_registered_devices) { ICICLE_CHECK(icicle_set_device(device)); @@ -837,24 +844,30 @@ TYPED_TEST(FieldApiTest, polynomialDivision) auto r = std::make_unique(r_size * batch_size); auto config = default_vec_ops_config(); - config.batch_size = batch_size; + config.batch_size = columns_batch ? 
batch_size - zero_pad_length : batch_size; // skip the zero cols config.columns_batch = columns_batch; + // TODO v3.2 support column batch for this API + if (columns_batch) { + ICICLE_LOG_INFO << "Skipping polynomial division column batch"; + continue; + } + ICICLE_CHECK(polynomial_division( - numerator.get(), max_num_deg, numerator_size, denumerator.get(), max_denum_deg, denumerator_size, q_size, - r_size, config, q.get(), r.get())); + numerator.get(), numerator_size, denominator.get(), denominator_size, config, q.get(), q_size, r.get(), + r_size)); // test a(x)=q(x)b(x)+r(x) in random point const auto rand_x = TypeParam::rand_host(); - auto ax = std::make_unique(batch_size); - auto bx = std::make_unique(batch_size); - auto qx = std::make_unique(batch_size); - auto rx = std::make_unique(batch_size); + auto ax = std::make_unique(config.batch_size); + auto bx = std::make_unique(config.batch_size); + auto qx = std::make_unique(config.batch_size); + auto rx = std::make_unique(config.batch_size); polynomial_eval(numerator.get(), numerator_size, &rand_x, 1, config, ax.get()); - polynomial_eval(denumerator.get(), denumerator_size, &rand_x, 1, config, bx.get()); + polynomial_eval(denominator.get(), denominator_size, &rand_x, 1, config, bx.get()); polynomial_eval(q.get(), q_size, &rand_x, 1, config, qx.get()); polynomial_eval(r.get(), r_size, &rand_x, 1, config, rx.get()); - for (int i = 0; i < batch_size; ++i) { + for (int i = 0; i < config.batch_size; ++i) { // ICICLE_LOG_DEBUG << "ax=" << ax[i] << ", bx=" << bx[i] << ", qx=" << qx[i] << ", rx=" << rx[i]; ASSERT_EQ(ax[i], qx[i] * bx[i] + rx[i]); } From b361b0fd738bc78d0da53c05752284bdc5a4fcf3 Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 18:07:51 +0200 Subject: [PATCH 34/43] vector div for extension field and test fix for missing ext field apis --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 1 + .../include/icicle/backend/vec_ops_backend.h | 13 +- icicle/src/vec_ops.cpp | 27 ++- 
icicle/tests/test_field_api.cpp | 166 ++++++++---------- 4 files changed, 113 insertions(+), 94 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index b22c1ade2..913793ef5 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -496,6 +496,7 @@ REGISTER_VECTOR_ADD_EXT_FIELD_BACKEND("CPU", cpu_vector_add); REGISTER_VECTOR_ACCUMULATE_EXT_FIELD_BACKEND("CPU", cpu_vector_accumulate); REGISTER_VECTOR_SUB_EXT_FIELD_BACKEND("CPU", cpu_vector_sub); REGISTER_VECTOR_MUL_EXT_FIELD_BACKEND("CPU", cpu_vector_mul); +REGISTER_VECTOR_DIV_EXT_FIELD_BACKEND("CPU", cpu_vector_div); REGISTER_CONVERT_MONTGOMERY_EXT_FIELD_BACKEND("CPU", cpu_convert_montgomery); #endif // EXT_FIELD diff --git a/icicle/include/icicle/backend/vec_ops_backend.h b/icicle/include/icicle/backend/vec_ops_backend.h index 69b64c893..36b41760e 100644 --- a/icicle/include/icicle/backend/vec_ops_backend.h +++ b/icicle/include/icicle/backend/vec_ops_backend.h @@ -302,6 +302,16 @@ namespace icicle { }(); \ } + void register_extension_vector_div(const std::string& deviceType, extFieldVectorOpImpl impl); + + #define REGISTER_VECTOR_DIV_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_div_ext_field) = []() -> bool { \ + register_extension_vector_div(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + using extFieldConvertMontgomeryImpl = std::function; diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index ebb86e0c1..5c56facf8 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -180,6 +180,23 @@ namespace icicle { return CONCAT_EXPAND(FIELD, vector_div)(vec_a, vec_b, size, &config, output); } +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(VectorDivExtFieldDispatcher, extension_vector_div, extFieldVectorOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_div)( + const extension_t* vec_a, const extension_t* vec_b, 
uint64_t size, const VecOpsConfig* config, extension_t* output) + { + return VectorDivExtFieldDispatcher::execute(vec_a, vec_b, size, *config, output); + } + + template <> + eIcicleError vector_div( + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config, extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_vector_div)(vec_a, vec_b, size, &config, output); + } +#endif // EXT_FIELD + /*********************************** (Scalar + Vector) ELEMENT WISE ***********************************/ ICICLE_DISPATCHER_INST(ScalarAddDispatcher, scalar_add_vec, scalarVectorOpImpl); @@ -349,11 +366,12 @@ namespace icicle { const extension_t* input, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig* config, extension_t* output) { - return ExtFieldSliceDispatcher::execute(input, offset, stride, size, *config, output); + return ExtFieldSliceDispatcher::execute(input, offset, stride, size_in, size_out, *config, output); } template <> @@ -361,11 +379,12 @@ namespace icicle { const extension_t* input, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_slice)(input, offset, stride, size, &config, output); + return CONCAT_EXPAND(FIELD, extension_slice)(input, offset, stride, size_in, size_out, &config, output); } #endif // EXT_FIELD diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 6ae5a414c..67f7107d4 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -33,8 +33,7 @@ static inline std::string s_reference_target; static inline std::vector s_registered_devices; bool s_is_cuda_registered; // TODO Yuval remove this -template -class FieldApiTest : public ::testing::Test +class FieldApiTestBase : public ::testing::Test { public: // SetUpTestSuite/TearDownTestSuite are called once for the 
entire test suite @@ -60,7 +59,12 @@ class FieldApiTest : public ::testing::Test // SetUp/TearDown are called before and after each test void SetUp() override {} void TearDown() override {} +}; +template +class FieldApiTest : public FieldApiTestBase +{ +public: void random_samples(T* arr, uint64_t count) { for (uint64_t i = 0; i < count; i++) @@ -183,9 +187,9 @@ TYPED_TEST(FieldApiTest, vectorVectorOps) run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_mul, "vector mul", ITERS); ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); - // // div - FieldApiTest::random_samples(in_a.get(), total_size); - FieldApiTest::random_samples(in_b.get(), total_size); + // div + TypeParam::rand_host_many(in_a.get(), total_size); + TypeParam::rand_host_many(in_b.get(), total_size); // reference if (!s_is_cuda_registered) { for (int i = 0; i < total_size; i++) { @@ -253,7 +257,7 @@ TYPED_TEST(FieldApiTest, montgomeryConversion) ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); } -TYPED_TEST(FieldApiTest, VectorReduceOps) +TEST_F(FieldApiTestBase, VectorReduceOps) { int seed = time(0); srand(seed); @@ -267,17 +271,17 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) ICICLE_LOG_DEBUG << "batch_size = " << batch_size; ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; - auto in_a = std::make_unique(total_size); - auto out_main = std::make_unique(batch_size); - auto out_ref = std::make_unique(batch_size); + auto in_a = std::make_unique(total_size); + auto out_main = std::make_unique(batch_size); + auto out_ref = std::make_unique(batch_size); auto vector_accumulate_wrapper = - [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { + [](scalar_t* a, const scalar_t* b, uint64_t size, const VecOpsConfig& config, scalar_t* /*out*/) { return vector_accumulate(a, b, size, config); }; auto run = - [&](const std::string& dev_type, TypeParam* out, bool measure, auto 
vec_op_func, const char* msg, int iters) { + [&](const std::string& dev_type, scalar_t* out, bool measure, auto vec_op_func, const char* msg, int iters) { Device dev = {dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); @@ -295,10 +299,10 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) }; // sum - FieldApiTest::random_samples(in_a.get(), total_size); + scalar_t::rand_host_many(in_a.get(), total_size); // reference for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { - out_ref[idx_in_batch] = TypeParam::from(0); + out_ref[idx_in_batch] = scalar_t::from(0); } if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { @@ -308,16 +312,16 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) } } } else { - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); } - run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(TypeParam))); + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(scalar_t))); // product - FieldApiTest::random_samples(in_a.get(), total_size); + scalar_t::rand_host_many(in_a.get(), total_size); if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { - out_ref[idx_in_batch] = TypeParam::from(1); + out_ref[idx_in_batch] = scalar_t::from(1); } for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { @@ -326,13 +330,13 @@ TYPED_TEST(FieldApiTest, VectorReduceOps) } } } else { - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); + 
run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); } - run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(TypeParam))); + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(scalar_t))); } -TYPED_TEST(FieldApiTest, scalarVectorOps) +TEST_F(FieldApiTestBase, scalarVectorOps) { int seed = time(0); srand(seed); @@ -346,21 +350,21 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; const int total_size = N * batch_size; - auto scalar_a = std::make_unique(batch_size); - auto in_b = std::make_unique(total_size); - auto out_main = std::make_unique(total_size); - auto out_ref = std::make_unique(total_size); + auto scalar_a = std::make_unique(batch_size); + auto in_b = std::make_unique(total_size); + auto out_main = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); ICICLE_LOG_DEBUG << "N = " << N; ICICLE_LOG_DEBUG << "batch_size = " << batch_size; ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; auto vector_accumulate_wrapper = - [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { + [](scalar_t* a, const scalar_t* b, uint64_t size, const VecOpsConfig& config, scalar_t* /*out*/) { return vector_accumulate(a, b, size, config); }; auto run = - [&](const std::string& dev_type, TypeParam* out, bool measure, auto vec_op_func, const char* msg, int iters) { + [&](const std::string& dev_type, scalar_t* out, bool measure, auto vec_op_func, const char* msg, int iters) { Device dev = {dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); @@ -378,8 +382,8 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) }; // scalar add vec - 
FieldApiTest::random_samples(scalar_a.get(), batch_size); - FieldApiTest::random_samples(in_b.get(), total_size); + scalar_t::rand_host_many(scalar_a.get(), batch_size); + scalar_t::rand_host_many(in_b.get(), total_size); // reference if (!s_is_cuda_registered) { @@ -390,15 +394,15 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) } } } else { - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); } - run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); // scalar sub vec - FieldApiTest::random_samples(scalar_a.get(), batch_size); - FieldApiTest::random_samples(in_b.get(), total_size); + scalar_t::rand_host_many(scalar_a.get(), batch_size); + scalar_t::rand_host_many(in_b.get(), total_size); if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { @@ -408,15 +412,15 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) } } } else { - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); } - run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); // scalar mul vec - 
FieldApiTest::random_samples(scalar_a.get(), batch_size); - FieldApiTest::random_samples(in_b.get(), total_size); + scalar_t::rand_host_many(scalar_a.get(), batch_size); + scalar_t::rand_host_many(in_b.get(), total_size); if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { @@ -426,10 +430,10 @@ TYPED_TEST(FieldApiTest, scalarVectorOps) } } } else { - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); } - run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); } TYPED_TEST(FieldApiTest, matrixAPIsAsync) @@ -519,7 +523,7 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) // } // Option 3: Initialize the entire input array with random values - FieldApiTest::random_samples(h_inout.get(), total_size); + TypeParam::rand_host_many(h_inout.get(), total_size); // Reference implementation if (!s_is_cuda_registered) { @@ -666,6 +670,8 @@ TYPED_TEST(FieldApiTest, Slice) auto out_main = std::make_unique(total_size_out); auto out_ref = std::make_unique(total_size_out); + TypeParam::rand_host_many(in_a.get(), total_size_in); + auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { Device dev = {dev_type, 0}; icicle_set_device(dev); @@ -683,25 +689,6 @@ TYPED_TEST(FieldApiTest, Slice) END_TIMER(SLICE, oss.str().c_str(), measure); }; - // // Option 1: Initialize each input vector in the batch with the same ascending values - // for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { - // for (uint32_t 
i = 0; i < size_in; i++) { - // if(columns_batch){ - // in_a[idx_in_batch + batch_size * i] = TypeParam::from(i); - // } else { - // in_a[idx_in_batch * size_in + i] = TypeParam::from(i); - // } - // } - // } - - // // Option 2: Initialize the entire input array with ascending values - // for (int i = 0; i < total_size_in; i++) { - // in_a[i] = TypeParam::from(i); - // } - - // Option 3: Initialize the entire input array with random values - FieldApiTest::random_samples(in_a.get(), total_size_in); - // Reference implementation if (!s_is_cuda_registered) { for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { @@ -721,7 +708,7 @@ TYPED_TEST(FieldApiTest, Slice) ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size_out * sizeof(TypeParam))); } -TYPED_TEST(FieldApiTest, highestNonZeroIdx) +TEST_F(FieldApiTestBase, highestNonZeroIdx) { int seed = time(0); srand(seed); @@ -731,7 +718,7 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) const bool columns_batch = rand() % 2; const int total_size = N * batch_size; - auto in_a = std::make_unique(total_size); + auto in_a = std::make_unique(total_size); for (int i = 0; i < batch_size; ++i) { // randomize different rows with zeros in the end auto size = std::max(int64_t(N) / 4 - i, int64_t(1)); @@ -762,7 +749,7 @@ TYPED_TEST(FieldApiTest, highestNonZeroIdx) ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(int64_t))); } -TYPED_TEST(FieldApiTest, polynomialEval) +TEST_F(FieldApiTestBase, polynomialEval) { int seed = time(0); srand(seed); @@ -780,12 +767,12 @@ TYPED_TEST(FieldApiTest, polynomialEval) const int total_coeffs_size = coeffs_size * batch_size; const int total_result_size = domain_size * batch_size; - auto in_coeffs = std::make_unique(total_coeffs_size); - auto in_domain = std::make_unique(domain_size); - auto out_main = std::make_unique(total_result_size); - auto out_ref = std::make_unique(total_result_size); + auto in_coeffs = std::make_unique(total_coeffs_size); + 
auto in_domain = std::make_unique(domain_size); + auto out_main = std::make_unique(total_result_size); + auto out_ref = std::make_unique(total_result_size); - auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { + auto run = [&](const std::string& dev_type, scalar_t* out, bool measure, const char* msg, int iters) { Device dev = {dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); @@ -802,15 +789,15 @@ TYPED_TEST(FieldApiTest, polynomialEval) END_TIMER(polynomialEval, oss.str().c_str(), measure); }; - FieldApiTest::random_samples(in_coeffs.get(), total_coeffs_size); - FieldApiTest::random_samples(in_domain.get(), domain_size); + scalar_t::rand_host_many(in_coeffs.get(), total_coeffs_size); + scalar_t::rand_host_many(in_domain.get(), domain_size); run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_result_size * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_result_size * sizeof(scalar_t))); } -TYPED_TEST(FieldApiTest, polynomialDivision) +TEST_F(FieldApiTestBase, polynomialDivision) { const uint64_t numerator_size = 1 << 4; const uint64_t denominator_size = 1 << 3; @@ -821,17 +808,17 @@ TYPED_TEST(FieldApiTest, polynomialDivision) // basically we compute q(x),r(x) for a(x)=q(x)b(x)+r(x) by dividing a(x)/b(x) // randomize matrix with rows/cols as polynomials - auto numerator = std::make_unique(numerator_size * batch_size); - auto denominator = std::make_unique(denominator_size * batch_size); - TypeParam::rand_host_many(numerator.get(), numerator_size * batch_size); - TypeParam::rand_host_many(denominator.get(), denominator_size * batch_size); + auto numerator = std::make_unique(numerator_size * batch_size); + auto denominator = std::make_unique(denominator_size * batch_size); + 
scalar_t::rand_host_many(numerator.get(), numerator_size * batch_size); + scalar_t::rand_host_many(denominator.get(), denominator_size * batch_size); // Add padding to each row so that the degree is lower than the size const int zero_pad_length = 5; for (int i = 0; i < batch_size; ++i) { for (int j = 0; j < zero_pad_length; ++j) { - numerator[i * numerator_size + numerator_size - zero_pad_length + j] = TypeParam::zero(); - denominator[i * denominator_size + denominator_size - zero_pad_length + j] = TypeParam::zero(); + numerator[i * numerator_size + numerator_size - zero_pad_length + j] = scalar_t::zero(); + denominator[i * denominator_size + denominator_size - zero_pad_length + j] = scalar_t::zero(); } } @@ -840,8 +827,8 @@ TYPED_TEST(FieldApiTest, polynomialDivision) for (int columns_batch = 0; columns_batch <= 1; columns_batch++) { ICICLE_LOG_DEBUG << "testing polynomial division on device " << device << " [column_batch=" << columns_batch << "]"; - auto q = std::make_unique(q_size * batch_size); - auto r = std::make_unique(r_size * batch_size); + auto q = std::make_unique(q_size * batch_size); + auto r = std::make_unique(r_size * batch_size); auto config = default_vec_ops_config(); config.batch_size = columns_batch ? 
batch_size - zero_pad_length : batch_size; // skip the zero cols @@ -857,11 +844,11 @@ TYPED_TEST(FieldApiTest, polynomialDivision) r_size)); // test a(x)=q(x)b(x)+r(x) in random point - const auto rand_x = TypeParam::rand_host(); - auto ax = std::make_unique(config.batch_size); - auto bx = std::make_unique(config.batch_size); - auto qx = std::make_unique(config.batch_size); - auto rx = std::make_unique(config.batch_size); + const auto rand_x = scalar_t::rand_host(); + auto ax = std::make_unique(config.batch_size); + auto bx = std::make_unique(config.batch_size); + auto qx = std::make_unique(config.batch_size); + auto rx = std::make_unique(config.batch_size); polynomial_eval(numerator.get(), numerator_size, &rand_x, 1, config, ax.get()); polynomial_eval(denominator.get(), denominator_size, &rand_x, 1, config, bx.get()); polynomial_eval(q.get(), q_size, &rand_x, 1, config, qx.get()); @@ -916,7 +903,8 @@ TYPED_TEST(FieldApiTest, ntt) const int total_size = N * batch_size; auto scalars = std::make_unique(total_size); - FieldApiTest::random_samples(scalars.get(), total_size); + TypeParam::rand_host_many(scalars.get(), total_size); + auto out_main = std::make_unique(total_size); auto out_ref = std::make_unique(total_size); auto run = [&](const std::string& dev_type, TypeParam* out, const char* msg, bool measure, int iters) { From fd208f4af7f3601de7c756bf55ee3e53dbd5849d Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 18:08:08 +0200 Subject: [PATCH 35/43] remove wrong file --- icicle_v3/include/icicle/mmcs.h | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 icicle_v3/include/icicle/mmcs.h diff --git a/icicle_v3/include/icicle/mmcs.h b/icicle_v3/include/icicle/mmcs.h deleted file mode 100644 index 94394b822..000000000 --- a/icicle_v3/include/icicle/mmcs.h +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once - -#include "errors.h" -#include "runtime.h" -#include "hash.h" -#include "merkle_tree.h" -#include 
"icicle/utils/utils.h" - -#include -#include - - -template - struct Matrix { - T* values; - size_t width; - size_t height; - }; - -eIcicleError build_mmcs_tree(const Matrix* inputs, - const unsigned int number_of_inputs, - limb_t** outputs, - const Hash& hash, - const Hash& compression, - const MerkleTreeConfig& config); - - //create hash <-hasher,compressor - - //sort, and call merkle tree - //how to return outputs? \ No newline at end of file From 4de758f003bc0854e1c45c49eae03a71c8b1e0d3 Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 18:10:24 +0200 Subject: [PATCH 36/43] revert api headers --- icicle/include/icicle/api/bls12_377.h | 39 --------------------------- icicle/include/icicle/api/bls12_381.h | 39 --------------------------- icicle/include/icicle/api/grumpkin.h | 10 ------- 3 files changed, 88 deletions(-) diff --git a/icicle/include/icicle/api/bls12_377.h b/icicle/include/icicle/api/bls12_377.h index 9cd0e9d66..972bd59e2 100644 --- a/icicle/include/icicle/api/bls12_377.h +++ b/icicle/include/icicle/api/bls12_377.h @@ -10,45 +10,6 @@ #include "icicle/msm.h" #include "icicle/vec_ops.h" -extern "C" bool bls12_377_g2_eq(bls12_377::g2_projective_t* point1, bls12_377::g2_projective_t* point2); - -extern "C" void bls12_377_g2_to_affine(bls12_377::g2_projective_t* point, bls12_377::g2_affine_t* point_out); - -extern "C" void bls12_377_g2_generate_projective_points(bls12_377::g2_projective_t* points, int size); - -extern "C" void bls12_377_g2_generate_affine_points(bls12_377::g2_affine_t* points, int size); - -extern "C" eIcicleError bls12_377_g2_affine_convert_montgomery( - const bls12_377::g2_affine_t* input, - size_t n, - bool is_into, - const VecOpsConfig* config, - bls12_377::g2_affine_t* output); - -extern "C" eIcicleError bls12_377_g2_projective_convert_montgomery( - const bls12_377::g2_projective_t* input, - size_t n, - bool is_into, - const VecOpsConfig* config, - bls12_377::g2_projective_t* output); - -extern "C" eIcicleError 
bls12_377_ecntt( - const bls12_377::projective_t* input, - int size, - NTTDir dir, - const NTTConfig* config, - bls12_377::projective_t* output); - -extern "C" eIcicleError bls12_377_precompute_msm_bases( - const bls12_377::affine_t* bases, int nof_bases, const MSMConfig* config, bls12_377::affine_t* output_bases); - -extern "C" eIcicleError bls12_377_msm( - const bls12_377::scalar_t* scalars, - const bls12_377::affine_t* points, - int msm_size, - const MSMConfig* config, - bls12_377::projective_t* out); - extern "C" bool bls12_377_eq(bls12_377::projective_t* point1, bls12_377::projective_t* point2); extern "C" void bls12_377_to_affine(bls12_377::projective_t* point, bls12_377::affine_t* point_out); diff --git a/icicle/include/icicle/api/bls12_381.h b/icicle/include/icicle/api/bls12_381.h index 01165b2d6..03e3bdd36 100644 --- a/icicle/include/icicle/api/bls12_381.h +++ b/icicle/include/icicle/api/bls12_381.h @@ -10,45 +10,6 @@ #include "icicle/msm.h" #include "icicle/vec_ops.h" -extern "C" bool bls12_381_g2_eq(bls12_381::g2_projective_t* point1, bls12_381::g2_projective_t* point2); - -extern "C" void bls12_381_g2_to_affine(bls12_381::g2_projective_t* point, bls12_381::g2_affine_t* point_out); - -extern "C" void bls12_381_g2_generate_projective_points(bls12_381::g2_projective_t* points, int size); - -extern "C" void bls12_381_g2_generate_affine_points(bls12_381::g2_affine_t* points, int size); - -extern "C" eIcicleError bls12_381_g2_affine_convert_montgomery( - const bls12_381::g2_affine_t* input, - size_t n, - bool is_into, - const VecOpsConfig* config, - bls12_381::g2_affine_t* output); - -extern "C" eIcicleError bls12_381_g2_projective_convert_montgomery( - const bls12_381::g2_projective_t* input, - size_t n, - bool is_into, - const VecOpsConfig* config, - bls12_381::g2_projective_t* output); - -extern "C" eIcicleError bls12_381_ecntt( - const bls12_381::projective_t* input, - int size, - NTTDir dir, - const NTTConfig* config, - bls12_381::projective_t* output); 
- -extern "C" eIcicleError bls12_381_precompute_msm_bases( - const bls12_381::affine_t* bases, int nof_bases, const MSMConfig* config, bls12_381::affine_t* output_bases); - -extern "C" eIcicleError bls12_381_msm( - const bls12_381::scalar_t* scalars, - const bls12_381::affine_t* points, - int msm_size, - const MSMConfig* config, - bls12_381::projective_t* out); - extern "C" bool bls12_381_eq(bls12_381::projective_t* point1, bls12_381::projective_t* point2); extern "C" void bls12_381_to_affine(bls12_381::projective_t* point, bls12_381::affine_t* point_out); diff --git a/icicle/include/icicle/api/grumpkin.h b/icicle/include/icicle/api/grumpkin.h index 3acdfa5c1..235b72843 100644 --- a/icicle/include/icicle/api/grumpkin.h +++ b/icicle/include/icicle/api/grumpkin.h @@ -9,16 +9,6 @@ #include "icicle/msm.h" #include "icicle/vec_ops.h" -extern "C" eIcicleError grumpkin_precompute_msm_bases( - const grumpkin::affine_t* bases, int nof_bases, const MSMConfig* config, grumpkin::affine_t* output_bases); - -extern "C" eIcicleError grumpkin_msm( - const grumpkin::scalar_t* scalars, - const grumpkin::affine_t* points, - int msm_size, - const MSMConfig* config, - grumpkin::projective_t* out); - extern "C" bool grumpkin_eq(grumpkin::projective_t* point1, grumpkin::projective_t* point2); extern "C" void grumpkin_to_affine(grumpkin::projective_t* point, grumpkin::affine_t* point_out); From c9788e9fcc50384dd9b7547f6aede10fe6554c21 Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 18:16:36 +0200 Subject: [PATCH 37/43] minor cleanup --- .../icicle/polynomials/default_backend/default_poly_backend.h | 2 +- icicle/tests/test_field_api.cpp | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h index bfa57f9c3..ef59f816f 100644 --- a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h +++ 
b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h @@ -65,7 +65,7 @@ namespace icicle { config.is_async = true; config.stream = m_stream; - ICICLE_CHECK(icicle::slice(in_coeffs, offset, stride, size, out_size, config, out_coeffs)); + ICICLE_CHECK(icicle::slice(in_coeffs, offset, stride, in_size, out_size, config, out_coeffs)); } void add_sub(PolyContext& res, PolyContext a, PolyContext b, bool add1_sub0) diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 67f7107d4..703018797 100644 --- a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -2,9 +2,7 @@ #include #include #include "dlfcn.h" -#include #include -#include // For system #include "icicle/runtime.h" #include "icicle/vec_ops.h" From fdc7a5c428db9cff13b3750e1ad5ebc8937ae599 Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 19:14:30 +0200 Subject: [PATCH 38/43] update go vec-ops config struct --- wrappers/golang/core/vec_ops.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/wrappers/golang/core/vec_ops.go b/wrappers/golang/core/vec_ops.go index 08b87ef08..3671f0653 100644 --- a/wrappers/golang/core/vec_ops.go +++ b/wrappers/golang/core/vec_ops.go @@ -29,7 +29,15 @@ type VecOpsConfig struct { /// non-blocking and you'll need to synchronize it explicitly by calling /// `SynchronizeStream`. If set to false, the function will block the current CPU thread. IsAsync bool - Ext config_extension.ConfigExtensionHandler + /// Number of vectors (or operations) to process in a batch. + /// Each vector operation will be performed independently on each batch element. + /// Default value: 1. + BatchSize int32 + /// True if the batched vectors are stored as columns in a 2D array (i.e., the vectors are + /// strided in memory as columns of a matrix). If false, the batched vectors are stored + /// contiguously in memory (e.g., as rows or in a flat array). Default value: false. 
+ ColumnsBatch bool + Ext config_extension.ConfigExtensionHandler } /** @@ -43,6 +51,8 @@ func DefaultVecOpsConfig() VecOpsConfig { false, // isBOnDevice false, // isResultOnDevice false, // IsAsync + 1, // BatchSize + false, // ColumnsBatch nil, // Ext } From 198d196d873b073b81d4c344a1cb1399a518d759 Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 19:42:48 +0200 Subject: [PATCH 39/43] fix C++ example --- .../c++/polynomial-multiplication/example.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/examples/c++/polynomial-multiplication/example.cpp b/examples/c++/polynomial-multiplication/example.cpp index 9bd90b842..1fdfeb501 100644 --- a/examples/c++/polynomial-multiplication/example.cpp +++ b/examples/c++/polynomial-multiplication/example.cpp @@ -69,21 +69,18 @@ int main(int argc, char** argv) ICICLE_CHECK(bn254_ntt(polyB.get(), NTT_SIZE, NTTDir::kForward, &ntt_config, d_polyB)); // (4) multiply A,B - VecOpsConfig config{ - nullptr, - true, // is_a_on_device - true, // is_b_on_device - true, // is_result_on_device - false, // is_async - nullptr // ext - }; - ICICLE_CHECK(bn254_vector_mul(d_polyA, d_polyB, NTT_SIZE, &config, d_polyRes)); + VecOpsConfig config = default_vec_ops_config(); + config.is_a_on_device = true; + config.is_b_on_device = true; + config.is_result_on_device = true; + + ICICLE_CHECK(vector_mul(d_polyA, d_polyB, NTT_SIZE, config, d_polyRes)); // (5) INTT (in place) ntt_config.are_inputs_on_device = true; ntt_config.are_outputs_on_device = true; ntt_config.ordering = Ordering::kMN; - ICICLE_CHECK(bn254_ntt(d_polyRes, NTT_SIZE, NTTDir::kInverse, &ntt_config, d_polyRes)); + ICICLE_CHECK(ntt(d_polyRes, NTT_SIZE, NTTDir::kInverse, ntt_config, d_polyRes)); if (print) { END_TIMER(poly_multiply, "polynomial multiplication took"); } From 8f827d6727b056d469b34aab90dabd4de30070cf Mon Sep 17 00:00:00 2001 From: Emir Soyturk Date: Mon, 4 Nov 2024 21:30:23 +0300 Subject: [PATCH 40/43] vec_ops rust binding 
and tests (#642) --- wrappers/rust/icicle-core/src/vec_ops/mod.rs | 350 ++++++++++++++++++ .../rust/icicle-core/src/vec_ops/tests.rs | 234 +++++++++++- 2 files changed, 583 insertions(+), 1 deletion(-) diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index ba22b776d..277846ee8 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -13,6 +13,8 @@ pub struct VecOpsConfig { pub is_b_on_device: bool, pub is_result_on_device: bool, pub is_async: bool, + pub batch_size: i32, + pub columns_batch: bool, pub ext: ConfigExtension, } @@ -24,6 +26,8 @@ impl VecOpsConfig { is_b_on_device: false, is_result_on_device: false, is_async: false, + batch_size: 1, + columns_batch: false, ext: ConfigExtension::new(), } } @@ -58,6 +62,46 @@ pub trait VecOps { cfg: &VecOpsConfig, ) -> Result<(), eIcicleError>; + fn div( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn sum( + a: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn product( + a: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn scalar_add( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn scalar_sub( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn scalar_mul( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: 
&VecOpsConfig, + ) -> Result<(), eIcicleError>; + fn transpose( input: &(impl HostOrDeviceSlice + ?Sized), nof_rows: u32, @@ -76,6 +120,16 @@ pub trait VecOps { input: &mut (impl HostOrDeviceSlice + ?Sized), cfg: &VecOpsConfig, ) -> Result<(), eIcicleError>; + + fn slice( + input: &(impl HostOrDeviceSlice + ?Sized), + offset: u64, + stride: u64, + size_in: u64, + size_out: u64, + cfg: &VecOpsConfig, + output: &mut (impl HostOrDeviceSlice + ?Sized), + ) -> Result<(), eIcicleError>; } fn check_vec_ops_args<'a, F>( @@ -166,6 +220,88 @@ where <::Config as VecOps>::mul(a, b, result, &cfg) } +pub fn div_scalars( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(a, b, result, cfg); + <::Config as VecOps>::div(a, b, result, &cfg) +} + +pub fn sum_scalars( + a: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(a, a, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::sum(a, result, &cfg) +} + +pub fn product_scalars( + a: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(a, a, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::product(a, result, &cfg) +} + +pub fn scalar_add( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(b, b, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::scalar_add(a, b, result, &cfg) 
+} + +pub fn scalar_sub( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(b, b, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::scalar_sub(a, b, result, &cfg) +} + +pub fn scalar_mul( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(b, b, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::scalar_mul(a, b, result, &cfg) +} + pub fn transpose_matrix( input: &(impl HostOrDeviceSlice + ?Sized), nof_rows: u32, @@ -205,6 +341,23 @@ where <::Config as VecOps>::bit_reverse_inplace(input, &cfg) } +pub fn slice( + input: &(impl HostOrDeviceSlice + ?Sized), + offset: u64, + stride: u64, + size_in: u64, + size_out: u64, + cfg: &VecOpsConfig, + output: &mut (impl HostOrDeviceSlice + ?Sized), +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + <::Config as VecOps>::slice(input, offset, stride, size_in, size_out, &cfg, output) +} + + #[macro_export] macro_rules! impl_vec_ops_field { ( @@ -255,6 +408,59 @@ macro_rules! 
impl_vec_ops_field { result: *mut $field, ) -> eIcicleError; + #[link_name = concat!($field_prefix, "_vector_div")] + pub(crate) fn vector_div_ffi( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_vector_sum")] + pub(crate) fn vector_sum_ffi( + a: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + + #[link_name = concat!($field_prefix, "_vector_product")] + pub(crate) fn vector_product_ffi( + a: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_scalar_add_vec")] + pub(crate) fn scalar_add_ffi( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_scalar_sub_vec")] + pub(crate) fn scalar_sub_ffi( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_scalar_mul_vec")] + pub(crate) fn scalar_mul_ffi( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + #[link_name = concat!($field_prefix, "_matrix_transpose")] pub(crate) fn matrix_transpose_ffi( input: *const $field, @@ -271,6 +477,17 @@ macro_rules! impl_vec_ops_field { config: *const VecOpsConfig, output: *mut $field, ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_slice")] + pub(crate) fn slice_ffi( + input: *const $field, + offset: u64, + stride: u64, + size_in: u64, + size_out: u64, + cfg: *const VecOpsConfig, + output: *mut $field, + ) -> eIcicleError; } } @@ -344,6 +561,110 @@ macro_rules! 
impl_vec_ops_field { .wrap() } } + + fn div( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + b: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::vector_div_ffi( + a.as_ptr(), + b.as_ptr(), + a.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn sum( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::vector_sum_ffi( + a.as_ptr(), + a.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn product( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::vector_sum_ffi( + a.as_ptr(), + a.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn scalar_add( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + b: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::scalar_add_ffi( + a.as_ptr(), + b.as_ptr(), + b.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn scalar_sub( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + b: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::scalar_sub_ffi( + a.as_ptr(), + b.as_ptr(), + b.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn scalar_mul( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + b: &(impl 
HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::scalar_mul_ffi( + a.as_ptr(), + b.as_ptr(), + b.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } fn transpose( input: &(impl HostOrDeviceSlice<$field> + ?Sized), @@ -394,6 +715,29 @@ macro_rules! impl_vec_ops_field { .wrap() } } + + fn slice( + input: &(impl HostOrDeviceSlice<$field> + ?Sized), + offset: u64, + stride: u64, + size_in: u64, + size_out: u64, + cfg: &VecOpsConfig, + output: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::slice_ffi( + input.as_ptr(), + offset, + stride, + size_in, + size_out, + cfg as *const VecOpsConfig, + output.as_mut_ptr(), + ) + .wrap() + } + } } }; } @@ -436,6 +780,12 @@ macro_rules! impl_vec_ops_tests { initialize(); check_bit_reverse_inplace::<$field>() } + + #[test] + pub fn test_slice() { + initialize(); + check_slice::<$field>() + } } }; } diff --git a/wrappers/rust/icicle-core/src/vec_ops/tests.rs b/wrappers/rust/icicle-core/src/vec_ops/tests.rs index 6762f06c9..4a16fcb21 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/tests.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/tests.rs @@ -2,7 +2,7 @@ use crate::test_utilities; use crate::traits::GenerateRandom; use crate::vec_ops::{ - accumulate_scalars, add_scalars, bit_reverse, bit_reverse_inplace, mul_scalars, sub_scalars, transpose_matrix, + accumulate_scalars, add_scalars, bit_reverse, bit_reverse_inplace, mul_scalars, slice, div_scalars, sum_scalars, scalar_add, scalar_sub, scalar_mul, product_scalars, sub_scalars, transpose_matrix, FieldImpl, VecOps, VecOpsConfig, }; use icicle_runtime::device::Device; @@ -44,6 +44,12 @@ where check_vec_ops_scalars_add::(test_size); check_vec_ops_scalars_sub::(test_size); check_vec_ops_scalars_mul::(test_size); + 
check_vec_ops_scalars_div::(test_size); + check_vec_ops_scalars_sum::(test_size); + check_vec_ops_scalars_product::(test_size); + check_vec_ops_scalars_add_scalar::(test_size); + check_vec_ops_scalars_sub_scalar::(test_size); + check_vec_ops_scalars_mul_scalar::(test_size); check_vec_ops_scalars_accumulate::(test_size); } @@ -140,6 +146,191 @@ where .unwrap(); } +pub fn check_vec_ops_scalars_div(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(test_size); + let b = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let b = HostSlice::from_slice(&b); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + + test_utilities::test_set_main_device(); + div_scalars(a_main, b, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + div_scalars(a_main, b, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_sum(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + + test_utilities::test_set_main_device(); + sum_scalars(a_main, result_main, &cfg).unwrap(); + + 
test_utilities::test_set_ref_device(); + sum_scalars(a_main, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_product(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + + test_utilities::test_set_main_device(); + product_scalars(a_main, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + product_scalars(a_main, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_add_scalar(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(1); + let b = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let b = HostSlice::from_slice(&b); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + cfg.batch_size = 1; + + test_utilities::test_set_main_device(); + scalar_add(a_main, b, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + scalar_add(a_main, b, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + 
.destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_sub_scalar(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(1); + let b = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let b = HostSlice::from_slice(&b); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + cfg.batch_size = 1; + + test_utilities::test_set_main_device(); + scalar_sub(a_main, b, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + scalar_sub(a_main, b, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_mul_scalar(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(1); + let b = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let b = HostSlice::from_slice(&b); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + cfg.batch_size = 1; + + test_utilities::test_set_main_device(); + scalar_mul(a_main, b, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + scalar_mul(a_main, b, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + pub fn 
check_vec_ops_scalars_accumulate(test_size: usize) where ::Config: VecOps + GenerateRandom, @@ -205,6 +396,47 @@ where assert_eq!(result_main, result_ref); } +pub fn check_slice() +where + ::Config: VecOps + GenerateRandom, +{ + let size_in: u64 = 1 << 10; + let offset: u64 = 10; + let stride: u64 = 3; + let size_out: u64 = ((size_in - offset) / stride) - 1; + + let input_matrix = F::Config::generate_random(size_in as usize); + let mut result_main = vec![F::zero(); size_out as usize]; + let mut result_ref = vec![F::zero(); size_out as usize]; + + let cfg = VecOpsConfig::default(); + test_utilities::test_set_main_device(); + slice( + HostSlice::from_slice(&input_matrix), + offset, + stride, + size_in, + size_out, + &cfg, + HostSlice::from_mut_slice(&mut result_main), + ) + .unwrap(); + + test_utilities::test_set_ref_device(); + slice( + HostSlice::from_slice(&input_matrix), + offset, + stride, + size_in, + size_out, + &cfg, + HostSlice::from_mut_slice(&mut result_ref), + ) + .unwrap(); + + assert_eq!(result_main, result_ref); +} + pub fn check_bit_reverse() where ::Config: VecOps + GenerateRandom, From 0c25f75bb0d41b39dde11c8f218cc0b8c4816986 Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 20:33:32 +0200 Subject: [PATCH 41/43] formatting rust --- wrappers/rust/icicle-core/src/vec_ops/mod.rs | 4 +--- wrappers/rust/icicle-core/src/vec_ops/tests.rs | 5 +++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index 277846ee8..58e571d52 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -357,7 +357,6 @@ where <::Config as VecOps>::slice(input, offset, stride, size_in, size_out, &cfg, output) } - #[macro_export] macro_rules! impl_vec_ops_field { ( @@ -425,7 +424,6 @@ macro_rules! 
impl_vec_ops_field { result: *mut $field, ) -> eIcicleError; - #[link_name = concat!($field_prefix, "_vector_product")] pub(crate) fn vector_product_ffi( a: *const $field, @@ -561,7 +559,7 @@ macro_rules! impl_vec_ops_field { .wrap() } } - + fn div( a: &(impl HostOrDeviceSlice<$field> + ?Sized), b: &(impl HostOrDeviceSlice<$field> + ?Sized), diff --git a/wrappers/rust/icicle-core/src/vec_ops/tests.rs b/wrappers/rust/icicle-core/src/vec_ops/tests.rs index 4a16fcb21..0dbd4c9a3 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/tests.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/tests.rs @@ -2,8 +2,9 @@ use crate::test_utilities; use crate::traits::GenerateRandom; use crate::vec_ops::{ - accumulate_scalars, add_scalars, bit_reverse, bit_reverse_inplace, mul_scalars, slice, div_scalars, sum_scalars, scalar_add, scalar_sub, scalar_mul, product_scalars, sub_scalars, transpose_matrix, - FieldImpl, VecOps, VecOpsConfig, + accumulate_scalars, add_scalars, bit_reverse, bit_reverse_inplace, div_scalars, mul_scalars, product_scalars, + scalar_add, scalar_mul, scalar_sub, slice, sub_scalars, sum_scalars, transpose_matrix, FieldImpl, VecOps, + VecOpsConfig, }; use icicle_runtime::device::Device; use icicle_runtime::memory::{DeviceVec, HostSlice}; From dd6833b6760a18a8f1f49d16bedbfa7294c9e0dd Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 22:43:48 +0200 Subject: [PATCH 42/43] extension field vec ops --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 25 ++-- .../include/icicle/backend/vec_ops_backend.h | 77 ++++++++++-- icicle/src/vec_ops.cpp | 115 +++++++++++++++++- 3 files changed, 191 insertions(+), 26 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 913793ef5..22c257023 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -491,15 +491,6 @@ eIcicleError cpu_convert_montgomery( REGISTER_CONVERT_MONTGOMERY_BACKEND("CPU", 
cpu_convert_montgomery); -#ifdef EXT_FIELD -REGISTER_VECTOR_ADD_EXT_FIELD_BACKEND("CPU", cpu_vector_add); -REGISTER_VECTOR_ACCUMULATE_EXT_FIELD_BACKEND("CPU", cpu_vector_accumulate); -REGISTER_VECTOR_SUB_EXT_FIELD_BACKEND("CPU", cpu_vector_sub); -REGISTER_VECTOR_MUL_EXT_FIELD_BACKEND("CPU", cpu_vector_mul); -REGISTER_VECTOR_DIV_EXT_FIELD_BACKEND("CPU", cpu_vector_div); -REGISTER_CONVERT_MONTGOMERY_EXT_FIELD_BACKEND("CPU", cpu_convert_montgomery); -#endif // EXT_FIELD - /*********************************** SUM ***********************************/ template @@ -934,4 +925,18 @@ eIcicleError cpu_poly_divide( return eIcicleError::SUCCESS; } -REGISTER_POLYNOMIAL_DIVISION("CPU", cpu_poly_divide); \ No newline at end of file +REGISTER_POLYNOMIAL_DIVISION("CPU", cpu_poly_divide); + +#ifdef EXT_FIELD +REGISTER_VECTOR_ADD_EXT_FIELD_BACKEND("CPU", cpu_vector_add); +REGISTER_VECTOR_ACCUMULATE_EXT_FIELD_BACKEND("CPU", cpu_vector_accumulate); +REGISTER_VECTOR_SUB_EXT_FIELD_BACKEND("CPU", cpu_vector_sub); +REGISTER_VECTOR_MUL_EXT_FIELD_BACKEND("CPU", cpu_vector_mul); +REGISTER_VECTOR_DIV_EXT_FIELD_BACKEND("CPU", cpu_vector_div); +REGISTER_CONVERT_MONTGOMERY_EXT_FIELD_BACKEND("CPU", cpu_convert_montgomery); +REGISTER_VECTOR_SUM_EXT_FIELD_BACKEND("CPU", cpu_vector_sum); +REGISTER_VECTOR_PRODUCT_EXT_FIELD_BACKEND("CPU", cpu_vector_product); +REGISTER_SCALAR_MUL_VEC_EXT_FIELD_BACKEND("CPU", cpu_scalar_mul); +REGISTER_SCALAR_ADD_VEC_EXT_FIELD_BACKEND("CPU", cpu_scalar_add); +REGISTER_SCALAR_SUB_VEC_EXT_FIELD_BACKEND("CPU", cpu_scalar_sub); +#endif // EXT_FIELD \ No newline at end of file diff --git a/icicle/include/icicle/backend/vec_ops_backend.h b/icicle/include/icicle/backend/vec_ops_backend.h index 36b41760e..3739fb780 100644 --- a/icicle/include/icicle/backend/vec_ops_backend.h +++ b/icicle/include/icicle/backend/vec_ops_backend.h @@ -7,14 +7,6 @@ using namespace field_config; namespace icicle { /*************************** Backend registration ***************************/ - using 
vectorVectorOpImpl = std::function; - using vectorVectorOpImplInplaceA = std::function; @@ -82,7 +74,7 @@ namespace icicle { scalar_t* r_out /*OUT*/, uint64_t r_size)>; - void register_vector_add(const std::string& deviceType, vectorVectorOpImpl impl); + void register_vector_add(const std::string& deviceType, scalarVectorOpImpl impl); #define REGISTER_VECTOR_ADD_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ @@ -102,7 +94,7 @@ namespace icicle { }(); \ } - void register_vector_sub(const std::string& deviceType, vectorVectorOpImpl impl); + void register_vector_sub(const std::string& deviceType, scalarVectorOpImpl impl); #define REGISTER_VECTOR_SUB_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ static bool UNIQUE(_reg_vec_sub) = []() -> bool { \ @@ -111,7 +103,7 @@ namespace icicle { }(); \ } - void register_vector_mul(const std::string& deviceType, vectorVectorOpImpl impl); + void register_vector_mul(const std::string& deviceType, scalarVectorOpImpl impl); #define REGISTER_VECTOR_MUL_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ @@ -121,7 +113,7 @@ namespace icicle { }(); \ } - void register_vector_div(const std::string& deviceType, vectorVectorOpImpl impl); + void register_vector_div(const std::string& deviceType, scalarVectorOpImpl impl); #define REGISTER_VECTOR_DIV_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ @@ -263,6 +255,17 @@ namespace icicle { using extFieldVectorOpImplInplaceA = std::function; + using extFieldVectorReduceOpImpl = std::function; + + using extFieldVectorOpImpl = std::function; + void register_extension_vector_add(const std::string& deviceType, extFieldVectorOpImpl impl); #define REGISTER_VECTOR_ADD_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ @@ -312,6 +315,56 @@ namespace icicle { }(); \ } + void register_extension_scalar_mul_vec(const std::string& deviceType, extFieldVectorOpImpl impl); + + #define REGISTER_SCALAR_MUL_VEC_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_scalar_mul_vec_ext_field) = []() -> bool { \ + 
register_extension_scalar_mul_vec(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_extension_scalar_add_vec(const std::string& deviceType, extFieldVectorOpImpl impl); + + #define REGISTER_SCALAR_ADD_VEC_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_scalar_add_vec_ext_field) = []() -> bool { \ + register_extension_scalar_add_vec(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_extension_scalar_sub_vec(const std::string& deviceType, extFieldVectorOpImpl impl); + + #define REGISTER_SCALAR_SUB_VEC_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_scalar_sub_vec_ext_field) = []() -> bool { \ + register_extension_scalar_sub_vec(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_extension_vector_sum(const std::string& deviceType, extFieldVectorReduceOpImpl impl); + + #define REGISTER_VECTOR_SUM_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_sum_ext_field) = []() -> bool { \ + register_extension_vector_sum(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_extension_vector_product(const std::string& deviceType, extFieldVectorReduceOpImpl impl); + + #define REGISTER_VECTOR_PRODUCT_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_product_ext_field) = []() -> bool { \ + register_extension_vector_product(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + using extFieldConvertMontgomeryImpl = std::function + eIcicleError vector_product(const extension_t* vec_a, uint64_t size, const VecOpsConfig& config, extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_vector_product)(vec_a, size, &config, output); + } +#endif // EXT_FIELD + /*********************************** REDUCE SUM ****************************/ ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, VectorReduceOpImpl); @@ -33,8 +49,24 @@ namespace icicle { return CONCAT_EXPAND(FIELD, 
vector_sum)(vec_a, size, &config, output); } +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(VectorSumExtFieldDispatcher, extension_vector_sum, extFieldVectorReduceOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_sum)( + const extension_t* vec_a, uint64_t size, const VecOpsConfig* config, extension_t* output) + { + return VectorSumExtFieldDispatcher::execute(vec_a, size, *config, output); + } + + template <> + eIcicleError vector_sum(const extension_t* vec_a, uint64_t size, const VecOpsConfig& config, extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_vector_sum)(vec_a, size, &config, output); + } +#endif // EXT_FIELD + /*********************************** ADD ***********************************/ - ICICLE_DISPATCHER_INST(VectorAddDispatcher, vector_add, vectorVectorOpImpl); + ICICLE_DISPATCHER_INST(VectorAddDispatcher, vector_add, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_add)( const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) @@ -99,7 +131,7 @@ namespace icicle { #endif // EXT_FIELD /*********************************** SUB ***********************************/ - ICICLE_DISPATCHER_INST(VectorSubDispatcher, vector_sub, vectorVectorOpImpl); + ICICLE_DISPATCHER_INST(VectorSubDispatcher, vector_sub, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_sub)( const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) @@ -132,7 +164,7 @@ namespace icicle { #endif // EXT_FIELD /*********************************** MUL ***********************************/ - ICICLE_DISPATCHER_INST(VectorMulDispatcher, vector_mul, vectorVectorOpImpl); + ICICLE_DISPATCHER_INST(VectorMulDispatcher, vector_mul, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_mul)( const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) @@ -165,7 +197,7 
@@ namespace icicle { #endif // EXT_FIELD /*********************************** DIV ***********************************/ - ICICLE_DISPATCHER_INST(VectorDivDispatcher, vector_div, vectorVectorOpImpl); + ICICLE_DISPATCHER_INST(VectorDivDispatcher, vector_div, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_div)( const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) @@ -213,6 +245,31 @@ namespace icicle { return CONCAT_EXPAND(FIELD, scalar_add_vec)(scalar_a, vec_b, size, &config, output); } +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(ScalarAddExtFieldDispatcher, extension_scalar_add_vec, extFieldVectorOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_scalar_add_vec)( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig* config, + extension_t* output) + { + return ScalarAddExtFieldDispatcher::execute(scalar_a, vec_b, size, *config, output); + } + + template <> + eIcicleError scalar_add_vec( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig& config, + extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_scalar_add_vec)(scalar_a, vec_b, size, &config, output); + } +#endif // EXT_FIELD + /*********************************** (Scalar - Vector) ELEMENT WISE ***********************************/ ICICLE_DISPATCHER_INST(ScalarSubDispatcher, scalar_sub_vec, scalarVectorOpImpl); @@ -228,6 +285,31 @@ namespace icicle { { return CONCAT_EXPAND(FIELD, scalar_sub_vec)(scalar_a, vec_b, size, &config, output); } + +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(ScalarSubExtFieldDispatcher, extension_scalar_sub_vec, extFieldVectorOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_scalar_sub_vec)( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig* config, + extension_t* output) + { + return ScalarSubExtFieldDispatcher::execute(scalar_a, 
vec_b, size, *config, output); + } + + template <> + eIcicleError scalar_sub_vec( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig& config, + extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_scalar_sub_vec)(scalar_a, vec_b, size, &config, output); + } +#endif // EXT_FIELD /*********************************** MUL BY SCALAR ***********************************/ ICICLE_DISPATCHER_INST(ScalarMulDispatcher, scalar_mul_vec, scalarVectorOpImpl); @@ -244,6 +326,31 @@ namespace icicle { return CONCAT_EXPAND(FIELD, scalar_mul_vec)(scalar_a, vec_b, size, &config, output); } +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(ScalarMulExtFieldDispatcher, extension_scalar_mul_vec, extFieldVectorOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_scalar_mul_vec)( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig* config, + extension_t* output) + { + return ScalarMulExtFieldDispatcher::execute(scalar_a, vec_b, size, *config, output); + } + + template <> + eIcicleError scalar_mul_vec( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig& config, + extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_scalar_mul_vec)(scalar_a, vec_b, size, &config, output); + } +#endif // EXT_FIELD + /*********************************** CONVERT MONTGOMERY ***********************************/ ICICLE_DISPATCHER_INST(ScalarConvertMontgomeryDispatcher, scalar_convert_montgomery, scalarConvertMontgomeryImpl) From fbb9f5506e30677a45d8a6cb13bb91fe4059aac6 Mon Sep 17 00:00:00 2001 From: Yuval Shekel Date: Mon, 4 Nov 2024 22:44:06 +0200 Subject: [PATCH 43/43] release script build v3.1 --- scripts/release/build_all.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/release/build_all.sh b/scripts/release/build_all.sh index cbb4b8860..b8050fb70 100755 --- a/scripts/release/build_all.sh +++ 
b/scripts/release/build_all.sh @@ -32,25 +32,25 @@ docker run --rm --gpus all \ -v ./icicle:/icicle \ -v "$output_dir:/output" \ -v ./scripts:/scripts \ - icicle-release-ubuntu22-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubuntu22 cuda122 & + icicle-release-ubuntu22-cuda122 bash /scripts/release/build_release_and_tar.sh icicle_3_1_0 ubuntu22 cuda122 & # ubuntu 20 docker run --rm --gpus all \ -v ./icicle:/icicle \ -v "$output_dir:/output" \ -v ./scripts:/scripts \ - icicle-release-ubuntu20-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubuntu20 cuda122 & + icicle-release-ubuntu20-cuda122 bash /scripts/release/build_release_and_tar.sh icicle_3_1_0 ubuntu20 cuda122 & # ubi 8 (rhel compatible) docker run --rm --gpus all \ -v ./icicle:/icicle \ -v "$output_dir:/output" \ -v ./scripts:/scripts \ - icicle-release-ubi8-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubi8 cuda122 & + icicle-release-ubi8-cuda122 bash /scripts/release/build_release_and_tar.sh icicle_3_1_0 ubi8 cuda122 & # ubi 9 (rhel compatible) docker run --rm --gpus all \ -v ./icicle:/icicle \ -v "$output_dir:/output" \ -v ./scripts:/scripts \ - icicle-release-ubi9-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubi9 cuda122 & + icicle-release-ubi9-cuda122 bash /scripts/release/build_release_and_tar.sh icicle_3_1_0 ubi9 cuda122 &