diff --git a/docs/docs/icicle/golang-bindings/vec-ops.md b/docs/docs/icicle/golang-bindings/vec-ops.md index e93d9a0a2..e219ec26d 100644 --- a/docs/docs/icicle/golang-bindings/vec-ops.md +++ b/docs/docs/icicle/golang-bindings/vec-ops.md @@ -4,8 +4,8 @@ Icicle exposes a number of vector operations which a user can use: -* The VecOps API provides efficient vector operations such as addition, subtraction, and multiplication. -* MatrixTranspose API allows a user to perform a transpose on a vector representation of a matrix +* The VecOps API provides efficient vector operations such as addition, subtraction, and multiplication, supporting both single and batched operations. +* MatrixTranspose API allows a user to perform a transpose on a vector representation of a matrix, with support for batched transpositions. ## VecOps API Documentation @@ -121,6 +121,8 @@ type VecOpsConfig struct { isBOnDevice bool isResultOnDevice bool IsAsync bool + batch_size int + columns_batch bool Ext config_extension.ConfigExtensionHandler } ``` @@ -132,6 +134,8 @@ type VecOpsConfig struct { - **`isBOnDevice`**: Indicates if vector `b` is located on the device. - **`isResultOnDevice`**: Specifies where the result vector should be stored (device or host memory). - **`IsAsync`**: Controls whether the vector operation runs asynchronously. +- **`batch_size`**: Number of vectors (or operations) to process in a batch. Each vector operation will be performed independently on each batch element. +- **`columns_batch`**: true if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). If false, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array). - **`Ext`**: Extended configuration for backend. #### Default Configuration @@ -148,6 +152,8 @@ This section describes the functionality of the `TransposeMatrix` function used The function takes a matrix represented as a 1D slice and transposes it, storing the result in another 1D slice. +If VecOpsConfig specifies a batch_size greater than one, the transposition is performed on multiple matrices simultaneously, producing corresponding transposed matrices. The storage arrangement of batched matrices is determined by the columns_batch field in the VecOpsConfig. + ### Function ```go diff --git a/docs/docs/icicle/primitives/vec_ops.md b/docs/docs/icicle/primitives/vec_ops.md index e9e10c1a9..7f546dc16 100644 --- a/docs/docs/icicle/primitives/vec_ops.md +++ b/docs/docs/icicle/primitives/vec_ops.md @@ -16,6 +16,8 @@ The `VecOpsConfig` struct is a configuration object used to specify parameters f - **`is_b_on_device: bool`**: Indicates whether the second input vector (`b`) is already on the device. If `false`, the vector will be copied from the host to the device. This field is optional. - **`is_result_on_device: bool`**: Indicates whether the result should be stored on the device. If `false`, the result will be transferred back to the host. - **`is_async: bool`**: Specifies whether the vector operation should be performed asynchronously. When `true`, the operation will not block the CPU, allowing other operations to proceed concurrently. Asynchronous execution requires careful synchronization to ensure data integrity. +- **`batch_size: int`**: Number of vectors (or operations) to process in a batch. Each vector operation will be performed independently on each batch element. 
+- **`columns_batch: bool`**: True if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). If false, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array). - **`ext: ConfigExtension*`**: Backend-specific extensions. #### Default Configuration @@ -28,6 +30,9 @@ static VecOpsConfig default_vec_ops_config() { false, // is_b_on_device false, // is_result_on_device false, // is_async + 1, // batch_size + false, // columns_batch + nullptr // ext }; return config; } @@ -35,7 +40,7 @@ static VecOpsConfig default_vec_ops_config() { ### Element-wise Operations -These functions perform element-wise operations on two input vectors `a` and `b`, producing an output vector. +These functions perform element-wise operations on two input vectors a and b. If VecOpsConfig specifies a batch_size greater than one, the operations are performed on multiple pairs of vectors simultaneously, producing corresponding output vectors. #### `vector_add` @@ -90,9 +95,31 @@ template eIcicleError convert_montgomery(const T* input, uint64_t size, bool is_into, const VecOpsConfig& config, T* output); ``` +### Reduction operations + +These functions perform reduction operations on vectors. If VecOpsConfig specifies a batch_size greater than one, the operations are performed on multiple vectors simultaneously, producing corresponding output values. The storage arrangement of batched vectors is determined by the columns_batch field in the VecOpsConfig. + +#### `vector_sum` + +Computes the sum of all elements in each vector in a batch. + +```cpp +template +eIcicleError vector_sum(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); +``` + +#### `vector_product` + +Computes the product of all elements in each vector in a batch. + +```cpp +template +eIcicleError vector_product(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); +``` + ### Scalar-Vector Operations -These functions apply a scalar operation to each element of a vector. +These functions apply a scalar operation to each element of a vector. If VecOpsConfig specifies a batch_size greater than one, the operations are performed on multiple vector-scalar pairs simultaneously, producing corresponding output vectors. #### `scalar_add_vec / scalar_sub_vec` @@ -123,7 +150,7 @@ eIcicleError scalar_mul_vec(const T* scalar_a, const T* vec_b, uint64_t size, co ### Matrix Operations -These functions perform operations on matrices. +These functions perform operations on matrices. If VecOpsConfig specifies a batch_size greater than one, the operations are performed on multiple matrices simultaneously, producing corresponding output matrices. #### `matrix_transpose` @@ -138,7 +165,7 @@ eIcicleError matrix_transpose(const T* mat_in, uint32_t nof_rows, uint32_t nof_c #### `bit_reverse` -Reorders the vector elements based on a bit-reversal pattern. +Reorders the vector elements based on a bit-reversal pattern. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously. ```cpp template @@ -147,16 +174,16 @@ eIcicleError bit_reverse(const T* vec_in, uint64_t size, const VecOpsConfig& con #### `slice` -Extracts a slice from a vector. +Extracts a slice from a vector. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously, producing corresponding output vectors. 
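To make the `columns_batch` layout concrete before the updated `slice` signature below, here is a minimal sketch of how an element of a batched vector is addressed under the two layouts described above. This is illustrative plain C++ only; the helper name is hypothetical and not part of the ICICLE API:

```cpp
#include <cstdint>

// Index of element `i` of vector `idx_in_batch` in a batch of `batch_size`
// vectors, each of length `size`.
// columns_batch == true : vectors are the columns of a row-major (size x batch_size)
//                         matrix, so consecutive elements of one vector are
//                         `batch_size` apart in memory.
// columns_batch == false: vectors are stored back to back (vector j starts at j * size).
inline uint64_t batched_index(
  uint64_t i, uint64_t idx_in_batch, uint64_t size, uint64_t batch_size, bool columns_batch)
{
  return columns_batch ? i * batch_size + idx_in_batch : idx_in_batch * size + i;
}
```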
```cpp template -eIcicleError slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size, const VecOpsConfig& config, T* vec_out); +eIcicleError slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size_in, uint64_t size_out, const VecOpsConfig& config, T* vec_out); ``` #### `highest_non_zero_idx` -Finds the highest non-zero index in a vector. +Finds the highest non-zero index in a vector. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously. ```cpp template @@ -165,7 +192,7 @@ eIcicleError highest_non_zero_idx(const T* vec_in, uint64_t size, const VecOpsCo #### `polynomial_eval` -Evaluates a polynomial at given domain points. +Evaluates a polynomial at given domain points. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously. ```cpp template @@ -174,7 +201,7 @@ eIcicleError polynomial_eval(const T* coeffs, uint64_t coeffs_size, const T* dom #### `polynomial_division` -Divides two polynomials. +Divides two polynomials. If VecOpsConfig specifies a batch_size greater than one, the operation is performed on multiple vectors simultaneously. ```cpp template diff --git a/docs/docs/icicle/programmers_guide/general.md b/docs/docs/icicle/programmers_guide/general.md index b02cd2f9c..0bef2b850 100644 --- a/docs/docs/icicle/programmers_guide/general.md +++ b/docs/docs/icicle/programmers_guide/general.md @@ -21,6 +21,7 @@ The configuration struct allows users to modify settings such as: - Specifying whether inputs and outputs are on the host or device. - Adjusting the data layout for specific optimizations. +- Setting batching parameters (batch_size and columns_batch) to perform operations on multiple data sets simultaneously. - Passing custom options to the backend implementation through an extension mechanism, such as setting the number of CPU cores to use. ### Example (C++) @@ -31,6 +32,8 @@ The configuration struct allows users to modify settings such as: // Create config struct for vector add VecOpsConfig config = default_vec_ops_config(); // optionally modify the config struct here +config.batch_size = 4; // Process 4 vector operations in a batch +config.columns_batch = true; // Batched vectors are stored as columns // Call the API eIcicleError err = vector_add(vec_a, vec_b, size, config, vec_res); @@ -45,6 +48,8 @@ struct VecOpsConfig { bool is_b_on_device; /**< True if `b` is on the device, false if it is not. Default value: false. OPTIONAL. */ bool is_result_on_device; /**< If true, the output is preserved on the device, otherwise on the host. Default value: false. */ bool is_async; /**< Whether to run the vector operations asynchronously. */ + int batch_size; /**< Number of vector operations to process in a batch. Default value: 1. */ + bool columns_batch; /**< True if batched vectors are stored as columns; false if stored contiguously. Default value: false. */ ConfigExtension* ext = nullptr; /**< Backend-specific extension. */ }; ``` diff --git a/docs/docs/icicle/rust-bindings/vec-ops.md b/docs/docs/icicle/rust-bindings/vec-ops.md index 61aa71570..c42caafb5 100644 --- a/docs/docs/icicle/rust-bindings/vec-ops.md +++ b/docs/docs/icicle/rust-bindings/vec-ops.md @@ -1,10 +1,10 @@ # Vector Operations API -Our vector operations API includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory. 
+Our vector operations API includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory, as well as batched operations. ## Vector Operations Configuration -The `VecOpsConfig` struct encapsulates the settings for vector operations, including device context and operation modes. +The `VecOpsConfig` struct encapsulates the settings for vector operations, including device context, operation modes, and batching parameters. ### `VecOpsConfig` @@ -17,6 +17,8 @@ pub struct VecOpsConfig { pub is_b_on_device: bool, pub is_result_on_device: bool, pub is_async: bool, + pub batch_size: usize, + pub columns_batch: bool, pub ext: ConfigExtension, } ``` @@ -28,6 +30,9 @@ pub struct VecOpsConfig { - **`is_b_on_device: bool`**: Indicates whether the input b data has been preloaded on the device memory. If `false` inputs will be copied from host to device. - **`is_result_on_device: bool`**: Indicates whether the output data is preloaded in device memory. If `false` outputs will be copied from host to device. - **`is_async: bool`**: Specifies whether the NTT operation should be performed asynchronously. +- **`batch_size: usize`**: Number of vector operations to process in a single batch. Each operation will be performed independently on each batch element. +- **`columns_batch: bool`**: true if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). If false, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array). + - **`ext: ConfigExtension`**: extended configuration for backend. ### Default Configuration @@ -40,11 +45,11 @@ let cfg = VecOpsConfig::default(); ## Vector Operations -Vector operations are implemented through the `VecOps` trait, providing methods for addition, subtraction, and multiplication of vectors. +Vector operations are implemented through the `VecOps` trait, providing methods for addition, subtraction, and multiplication of vectors. These methods support both single and batched operations based on the batch_size and columns_batch configurations. ### Methods -All operations are element-wise operations, and the results placed into the `result` param. These operations are not in place. +All operations are element-wise operations, and the results placed into the `result` param. These operations are not in place, except for accumulate. - **`add`**: Computes the element-wise sum of two vectors. - **`accumulate`**: Sum input b to a inplace. 
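Across the C++, Go, and Rust bindings the batching fields behave the same way. As a concrete illustration, here is a hedged C++ sketch of a batched element-wise addition using the `VecOpsConfig` fields documented above; the header path, the `scalar_t` alias, and the sizes are assumptions made for the example, not taken verbatim from the library:

```cpp
#include <cstdint>
#include "icicle/vec_ops.h"             // assumed front-end header for vector_add / VecOpsConfig
#include "icicle/fields/field_config.h" // assumed: provides the scalar_t field type

using namespace field_config;

// Adds `batch` pairs of vectors of length `size` in a single call.
// The vectors are stored contiguously: pair j occupies elements [j*size, (j+1)*size).
eIcicleError batched_add(
  const scalar_t* a, const scalar_t* b, uint64_t size, int batch, scalar_t* out)
{
  VecOpsConfig cfg = default_vec_ops_config();
  cfg.batch_size = batch;    // number of independent additions
  cfg.columns_batch = false; // set to true if the vectors are column-strided instead
  return vector_add(a, b, size /*length of one vector*/, cfg, out);
}
```

Note that `size` remains the length of a single vector; the backend derives the total amount of work from `size * batch_size`.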
diff --git a/examples/c++/polynomial-multiplication/example.cpp b/examples/c++/polynomial-multiplication/example.cpp index 9bd90b842..1fdfeb501 100644 --- a/examples/c++/polynomial-multiplication/example.cpp +++ b/examples/c++/polynomial-multiplication/example.cpp @@ -69,21 +69,18 @@ int main(int argc, char** argv) ICICLE_CHECK(bn254_ntt(polyB.get(), NTT_SIZE, NTTDir::kForward, &ntt_config, d_polyB)); // (4) multiply A,B - VecOpsConfig config{ - nullptr, - true, // is_a_on_device - true, // is_b_on_device - true, // is_result_on_device - false, // is_async - nullptr // ext - }; - ICICLE_CHECK(bn254_vector_mul(d_polyA, d_polyB, NTT_SIZE, &config, d_polyRes)); + VecOpsConfig config = default_vec_ops_config(); + config.is_a_on_device = true; + config.is_b_on_device = true; + config.is_result_on_device = true; + + ICICLE_CHECK(vector_mul(d_polyA, d_polyB, NTT_SIZE, config, d_polyRes)); // (5) INTT (in place) ntt_config.are_inputs_on_device = true; ntt_config.are_outputs_on_device = true; ntt_config.ordering = Ordering::kMN; - ICICLE_CHECK(bn254_ntt(d_polyRes, NTT_SIZE, NTTDir::kInverse, &ntt_config, d_polyRes)); + ICICLE_CHECK(ntt(d_polyRes, NTT_SIZE, NTTDir::kInverse, ntt_config, d_polyRes)); if (print) { END_TIMER(poly_multiply, "polynomial multiplication took"); } diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 3a2156d60..22c257023 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -6,6 +6,9 @@ #include "icicle/fields/field_config.h" #include "tasks_manager.h" +#include +#include +#include using namespace field_config; using namespace icicle; @@ -17,15 +20,17 @@ enum VecOperation { VECTOR_SUB, VECTOR_MUL, VECTOR_DIV, + CONVERT_TO_MONTGOMERY, + CONVERT_FROM_MONTGOMERY, VECTOR_SUM, VECTOR_PRODUCT, SCALAR_ADD_VEC, SCALAR_SUB_VEC, SCALAR_MUL_VEC, - CONVERT_TO_MONTGOMERY, - CONVERT_FROM_MONTGOMERY, BIT_REVERSE, SLICE, + REPLACE_ELEMENTS, + OUT_OF_PLACE_MATRIX_TRANSPOSE, NOF_OPERATIONS }; @@ -46,18 +51,25 @@ class VectorOpTask : public TaskBase VectorOpTask() : TaskBase() {} // Set the operands to execute a task of 2 operands and 1 output and dispatch the task - void send_2ops_task(VecOperation operation, const int nof_operations, const T* op_a, const T* op_b, T* output) + void send_2ops_task( + VecOperation operation, + const uint32_t nof_operations, + const T* op_a, + const T* op_b, + const uint32_t stride, + T* output) { m_operation = operation; m_nof_operations = nof_operations; m_op_a = op_a; m_op_b = op_b; + m_stride = stride; m_output = output; dispatch(); } // Set the operands to execute a task of 1 operand and 1 output and dispatch the task - void send_1op_task(VecOperation operation, const int nof_operations, const T* op_a, T* output) + void send_1op_task(VecOperation operation, const uint32_t nof_operations, const T* op_a, T* output) { m_operation = operation; m_nof_operations = nof_operations; @@ -66,34 +78,94 @@ class VectorOpTask : public TaskBase dispatch(); } // Set the operands to execute a task of 1 operand and dispatch the task - void send_intermidiate_res_task(VecOperation operation, const int nof_operations, const T* op_a) + void + send_intermidiate_res_task(VecOperation operation, const uint64_t stop_index, const T* op_a, const uint64_t stride) { m_operation = operation; - m_nof_operations = nof_operations; + m_stop_index = stop_index; m_op_a = op_a; + m_stride = stride; dispatch(); } - // Set the operands to bitrev operation dispatch the task - void 
send_bitrev_task( - VecOperation operation, int bit_size, uint64_t start_index, const int nof_operations, const T* op_a, T* output) + // Set the operands for bit_reverse operation and dispatch the task + void send_bit_reverse_task( + VecOperation operation, + uint32_t bit_size, + uint64_t start_index, + const uint32_t nof_operations, + const T* op_a, + const uint64_t stride, + T* output) { m_operation = operation; + m_bit_size = bit_size; + m_start_index = start_index; m_nof_operations = nof_operations; m_op_a = op_a; + m_stride = stride; m_output = output; - m_bit_size = bit_size, m_start_index = start_index; dispatch(); } - // Set the operands to slice operation dispatch the task - void send_slice_task(VecOperation operation, uint64_t stride, const int nof_operations, const T* op_a, T* output) + // Set the operands for slice operation and dispatch the task + void send_slice_task( + VecOperation operation, + uint64_t stride, + uint64_t stride_out, + const uint32_t nof_operations, + const T* op_a, + T* output) { m_operation = operation; m_nof_operations = nof_operations; m_op_a = op_a; m_output = output; m_stride = stride; + m_stride_out = stride_out; + dispatch(); + } + + // Set the operands for replace_elements operation and dispatch the task + void send_replace_elements_task( + VecOperation operation, + const T* mat_in, + const uint32_t nof_operations, + std::vector& start_indices_in_mat, + uint64_t start_index, + uint32_t log_nof_rows, + uint32_t log_nof_cols, + const uint32_t stride, + T* mat_out) + { + m_operation = operation; + m_op_a = mat_in; + m_nof_operations = nof_operations; + m_start_indices_in_mat = &start_indices_in_mat; + m_start_index = start_index; // start index in start_indices vector + m_log_nof_rows = log_nof_rows; + m_log_nof_cols = log_nof_cols; + m_stride = stride; + m_output = mat_out; + dispatch(); + } + + void send_out_of_place_matrix_transpose_task( + VecOperation operation, + const T* mat_in, + const uint32_t nof_operations, + const uint32_t nof_rows, + const uint32_t nof_cols, + const uint32_t stride, + T* mat_out) + { + m_operation = operation; + m_op_a = mat_in; + m_nof_operations = nof_operations; + m_nof_rows = nof_rows; + m_nof_cols = nof_cols; + m_stride = stride; + m_output = mat_out; dispatch(); } @@ -130,56 +202,55 @@ class VectorOpTask : public TaskBase m_output[i] = m_op_a[i] * T::inverse(m_op_b[i]); } } - // Single worker functionality to execute scalar + vector - void scalar_add_vec() - { - for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = *m_op_a + m_op_b[i]; - } - } - // Single worker functionality to execute scalar - vector - void scalar_sub_vec() + // Single worker functionality to execute conversion from barret to montgomery + void convert_to_montgomery() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = *m_op_a + m_op_b[i]; + m_output[i] = T::to_montgomery(m_op_a[i]); } } - // Single worker functionality to execute scalar * vector - void scalar_mul_vec() + // Single worker functionality to execute conversion from montgomery to barret + void convert_from_montgomery() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = *m_op_a * m_op_b[i]; + m_output[i] = T::from_montgomery(m_op_a[i]); } } // Single worker functionality to execute sum(vector) void vector_sum() { - *m_output = m_op_a[0]; - for (uint64_t i = 1; i < m_nof_operations; ++i) { - *m_output = *m_output + m_op_a[i]; + m_intermidiate_res = T::zero(); + for (uint64_t i = 0; i < (m_stop_index * m_stride); i = i + m_stride) { + 
m_intermidiate_res = m_intermidiate_res + m_op_a[i]; } } // Single worker functionality to execute product(vector) void vector_product() { - *m_output = m_op_a[0]; - for (uint64_t i = 1; i < m_nof_operations; ++i) { - *m_output = *m_output * m_op_a[i]; + m_intermidiate_res = T::one(); + for (uint64_t i = 0; i < (m_stop_index * m_stride); i = i + m_stride) { + m_intermidiate_res = m_intermidiate_res * m_op_a[i]; } } - // Single worker functionality to execute conversion from barret to montgomery - void convert_to_montgomery() + // Single worker functionality to execute scalar + vector + void scalar_add_vec() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = T::to_montgomery(m_op_a[i]); + m_output[m_stride * i] = *m_op_a + m_op_b[m_stride * i]; } } - - // Single worker functionality to execute conversion from montgomery to barret - void convert_from_montgomery() + // Single worker functionality to execute scalar - vector + void scalar_sub_vec() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = T::from_montgomery(m_op_a[i]); + m_output[m_stride * i] = *m_op_a - m_op_b[m_stride * i]; + } + } + // Single worker functionality to execute scalar * vector + void scalar_mul_vec() + { + for (uint64_t i = 0; i < m_nof_operations; ++i) { + m_output[m_stride * i] = *m_op_a * m_op_b[m_stride * i]; } } // Single worker functionality to execute bit reverse reorder @@ -200,10 +271,10 @@ class VectorOpTask : public TaskBase if (m_output == m_op_a) { // inplace calculation if (rev_idx < idx) { // only on of the threads need to work - std::swap(m_output[idx], m_output[rev_idx]); + std::swap(m_output[m_stride * idx], m_output[m_stride * rev_idx]); } - } else { // out of place calculation - m_output[idx] = m_op_a[rev_idx]; // set index value + } else { // out of place calculation + m_output[m_stride * idx] = m_op_a[m_stride * rev_idx]; // set index value } } } @@ -212,7 +283,47 @@ class VectorOpTask : public TaskBase void slice() { for (uint64_t i = 0; i < m_nof_operations; ++i) { - m_output[i] = m_op_a[i * m_stride]; + m_output[i * m_stride_out] = m_op_a[i * m_stride]; + } + } + + // Function to perform modulus with Mersenne number + uint64_t mersenne_mod(uint64_t shifted_idx, uint32_t total_bits) + { + uint64_t mod = (1ULL << total_bits) - 1; + shifted_idx = (shifted_idx & mod) + (shifted_idx >> total_bits); + while (shifted_idx >= mod) { + shifted_idx = (shifted_idx & mod) + (shifted_idx >> total_bits); + } + return shifted_idx; + } + + // Single worker functionality to execute replace elements + void replace_elements() + { + const uint32_t total_bits = m_log_nof_rows + m_log_nof_cols; + for (uint32_t i = 0; i < m_nof_operations; ++i) { + uint64_t start_idx = (*m_start_indices_in_mat)[m_start_index + i]; + uint64_t idx = start_idx; + T prev = m_op_a[m_stride * idx]; + do { + uint64_t shifted_idx = idx << m_log_nof_rows; + uint64_t new_idx = mersenne_mod(shifted_idx, total_bits); + T next = m_op_a[m_stride * new_idx]; + m_output[m_stride * new_idx] = prev; + prev = next; + idx = new_idx; + } while (idx != start_idx); + } + } + + // Single worker functionality for out of place matrix transpose + void out_of_place_transpose() + { + for (uint32_t k = 0; k < m_nof_operations; ++k) { + for (uint32_t j = 0; j < m_nof_cols; ++j) { + m_output[m_stride * (j * m_nof_rows + k)] = m_op_a[m_stride * (k * m_nof_cols + j)]; + } } } @@ -223,27 +334,41 @@ class VectorOpTask : public TaskBase &VectorOpTask::vector_sub, // VECTOR_SUB, &VectorOpTask::vector_mul, // VECTOR_MUL, 
&VectorOpTask::vector_div, // VECTOR_DIV, + &VectorOpTask::convert_to_montgomery, // CONVERT_TO_MONTGOMERY, + &VectorOpTask::convert_from_montgomery, // CONVERT_FROM_MONTGOMERY, &VectorOpTask::vector_sum, // VECTOR_SUM &VectorOpTask::vector_product, // VECTOR_PRODUCT &VectorOpTask::scalar_add_vec, // SCALAR_ADD_VEC, &VectorOpTask::scalar_sub_vec, // SCALAR_SUB_VEC, &VectorOpTask::scalar_mul_vec, // SCALAR_MUL_VEC, - &VectorOpTask::convert_to_montgomery, // CONVERT_TO_MONTGOMERY, - &VectorOpTask::convert_from_montgomery, // CONVERT_FROM_MONTGOMERY, &VectorOpTask::bit_reverse, // BIT_REVERSE - &VectorOpTask::slice // SLICE + &VectorOpTask::slice, // SLICE + &VectorOpTask::replace_elements, // REPLACE_ELEMENTS + &VectorOpTask::out_of_place_transpose // OUT_OF_PLACE_MATRIX_TRANSPOSE + }; - VecOperation m_operation; // the operation to execute - int m_nof_operations; // number of operations to execute for this task - const T* m_op_a; // pointer to operand A. Operand A is a vector. - const T* m_op_b; // pointer to operand B. Operand B is a vector or scalar - uint64_t m_start_index; // index used in bitreverse - int m_bit_size; // use in bitrev operation - uint64_t m_stride; // used in slice operation - T* m_output; // pointer to the output. Can be a vector or scalar pointer - T m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer -}; + VecOperation m_operation; // the operation to execute + uint32_t m_nof_operations; // number of operations to execute for this task + const T* m_op_a; // pointer to operand A. Operand A is a vector, or matrix in case of replace_elements + const T* m_op_b; // pointer to operand B. Operand B is a vector or scalar + uint64_t m_start_index; // index used in bitreverse operation and out of place matrix transpose + uint64_t m_stop_index; // index used in reduce operations and out of place matrix transpose + uint32_t m_bit_size; // use in bitrev operation + uint64_t m_stride; // used to support column batch operations + uint64_t m_stride_out; // used in slice operation + T* + m_output; // pointer to the output. Can be a vector, scalar pointer, or a matrix pointer in case of replace_elements + uint32_t m_log_nof_rows; // log of the number of rows in the matrix, used in replace_elements + uint32_t m_log_nof_cols; // log of the number of columns in the matrix, used in replace_elements + uint32_t m_nof_rows; // the number of rows in the matrix, used in out of place matrix transpose + uint32_t m_nof_cols; // the number of columns in the matrix, used in out of place matrix transpose + const std::vector* m_start_indices_in_mat; // Indices used in replace_elements operations + +public: + T m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer + uint64_t m_idx_in_batch; // index in the batch. 
Used in intermediate res tasks +}; // class VectorOpTask #define NOF_OPERATIONS_PER_TASK 512 #define CONFIG_NOF_THREADS_KEY "n_threads" @@ -260,12 +385,14 @@ int get_nof_workers(const VecOpsConfig& config) // Execute a full task from the type vector = vector (op) vector template eIcicleError -cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < n; i += NOF_OPERATIONS_PER_TASK) { + TasksManager> task_manager(get_nof_workers(config) - 1); + const uint64_t total_nof_operations = size * config.batch_size; + for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_2ops_task(op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - i), vec_a + i, vec_b + i, output + i); + task_p->send_2ops_task( + op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), vec_a + i, vec_b + i, 1, output + i); } task_manager.wait_done(); return eIcicleError::SUCCESS; @@ -274,12 +401,19 @@ cpu_2vectors_op(VecOperation op, const T* vec_a, const T* vec_b, uint64_t n, con // Execute a full task from the type vector = scalar (op) vector template eIcicleError cpu_scalar_vector_op( - VecOperation op, const T* scalar_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) + VecOperation op, const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < n; i += NOF_OPERATIONS_PER_TASK) { - VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_2ops_task(op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - i), scalar_a, vec_b + i, output + i); + TasksManager> task_manager(get_nof_workers(config) - 1); + const uint64_t total_nof_operations = size; + const uint32_t stride = config.columns_batch ? config.batch_size : 1; + for (uint32_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_2ops_task( + op, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), scalar_a + idx_in_batch, + config.columns_batch ? vec_b + idx_in_batch + i * config.batch_size : vec_b + idx_in_batch * size + i, stride, + config.columns_batch ? 
output + idx_in_batch + i * config.batch_size : output + idx_in_batch * size + i); + } } task_manager.wait_done(); return eIcicleError::SUCCESS; @@ -287,11 +421,12 @@ eIcicleError cpu_scalar_vector_op( /////////////////////////////////////////////////////// // Functions to register at the CPU backend +/*********************************** ADD ***********************************/ template -eIcicleError -cpu_vector_add(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_add( + const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_2vectors_op(VecOperation::VECTOR_ADD, vec_a, vec_b, n, config, output); + return cpu_2vectors_op(VecOperation::VECTOR_ADD, vec_a, vec_b, size, config, output); } REGISTER_VECTOR_ADD_BACKEND("CPU", cpu_vector_add); @@ -299,113 +434,149 @@ REGISTER_VECTOR_ADD_BACKEND("CPU", cpu_vector_add); /*********************************** ACCUMULATE ***********************************/ template eIcicleError -cpu_vector_accumulate(const Device& device, T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config) +cpu_vector_accumulate(const Device& device, T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config) { - for (uint64_t i = 0; i < n; ++i) { - vec_a[i] = vec_a[i] + vec_b[i]; - } - return eIcicleError::SUCCESS; + return cpu_2vectors_op(VecOperation::VECTOR_ADD, vec_a, vec_b, size, config, vec_a); } REGISTER_VECTOR_ACCUMULATE_BACKEND("CPU", cpu_vector_accumulate); /*********************************** SUB ***********************************/ template -eIcicleError -cpu_vector_sub(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_sub( + const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_2vectors_op(VecOperation::VECTOR_SUB, vec_a, vec_b, n, config, output); + return cpu_2vectors_op(VecOperation::VECTOR_SUB, vec_a, vec_b, size, config, output); } REGISTER_VECTOR_SUB_BACKEND("CPU", cpu_vector_sub); /*********************************** MUL ***********************************/ template -eIcicleError -cpu_vector_mul(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_mul( + const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_2vectors_op(VecOperation::VECTOR_MUL, vec_a, vec_b, n, config, output); + return cpu_2vectors_op(VecOperation::VECTOR_MUL, vec_a, vec_b, size, config, output); } REGISTER_VECTOR_MUL_BACKEND("CPU", cpu_vector_mul); /*********************************** DIV ***********************************/ template -eIcicleError -cpu_vector_div(const Device& device, const T* vec_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_div( + const Device& device, const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_2vectors_op(VecOperation::VECTOR_DIV, vec_a, vec_b, n, config, output); + return cpu_2vectors_op(VecOperation::VECTOR_DIV, vec_a, vec_b, size, config, output); } REGISTER_VECTOR_DIV_BACKEND("CPU", cpu_vector_div); -/*********************************** SUM ***********************************/ +/*********************************** CONVERT MONTGOMERY ***********************************/ template -eIcicleError 
cpu_vector_sum(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) +eIcicleError cpu_convert_montgomery( + const Device& device, const T* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); - bool output_initialized = false; - uint64_t vec_s_offset = 0; - VectorOpTask* task_p; - // run until all vector deployed and all tasks completed - do { - task_p = vec_s_offset < n ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); - if (task_p->is_completed()) { - *output = output_initialized ? task_p->m_intermidiate_res : *output + task_p->m_intermidiate_res; - } - if (vec_s_offset < n) { - task_p->send_intermidiate_res_task( - VecOperation::VECTOR_SUM, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_s_offset), vec_a + vec_s_offset); - vec_s_offset += NOF_OPERATIONS_PER_TASK; - } - } while (task_p != nullptr); + TasksManager> task_manager(get_nof_workers(config) - 1); + const uint64_t total_nof_operations = size * config.batch_size; + for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_1op_task( + (is_to_montgomery ? CONVERT_TO_MONTGOMERY : CONVERT_FROM_MONTGOMERY), + std::min((uint64_t)NOF_OPERATIONS_PER_TASK, total_nof_operations - i), input + i, output + i); + } + task_manager.wait_done(); + for (uint64_t i = 0; i < size * config.batch_size; i++) {} return eIcicleError::SUCCESS; } -// Once backend will support - uncomment the following line -// REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); +REGISTER_CONVERT_MONTGOMERY_BACKEND("CPU", cpu_convert_montgomery); + /*********************************** SUM ***********************************/ + template -eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t n, const VecOpsConfig& config, T* output) +eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); - bool output_initialized = false; - uint64_t vec_s_offset = 0; - VectorOpTask* task_p; + TasksManager> task_manager(get_nof_workers(config) - 1); + std::vector output_initialized = std::vector(config.batch_size, false); + uint64_t vec_a_offset = 0; + uint64_t idx_in_batch = 0; // run until all vector deployed and all tasks completed - do { - task_p = vec_s_offset < n ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + while (true) { + VectorOpTask* task_p = + vec_a_offset < size ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + if (task_p == nullptr) { return eIcicleError::SUCCESS; } if (task_p->is_completed()) { - *output = output_initialized ? task_p->m_intermidiate_res : *output * task_p->m_intermidiate_res; + output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] + ? 
output[task_p->m_idx_in_batch] + task_p->m_intermidiate_res + : task_p->m_intermidiate_res; + output_initialized[task_p->m_idx_in_batch] = true; } - if (vec_s_offset < n) { + if (vec_a_offset < size) { + task_p->m_idx_in_batch = idx_in_batch; task_p->send_intermidiate_res_task( - VecOperation::VECTOR_SUM, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - vec_s_offset), vec_a + vec_s_offset); - vec_s_offset += NOF_OPERATIONS_PER_TASK; + VecOperation::VECTOR_SUM, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - vec_a_offset), + config.columns_batch ? vec_a + idx_in_batch + vec_a_offset * config.batch_size + : vec_a + idx_in_batch * size + vec_a_offset, + config.columns_batch ? config.batch_size : 1); + idx_in_batch++; + if (idx_in_batch == config.batch_size) { + vec_a_offset += NOF_OPERATIONS_PER_TASK; + idx_in_batch = 0; + } + } else { + task_p->set_idle(); } - } while (task_p != nullptr); - return eIcicleError::SUCCESS; + } } -// Once backend will support - uncomment the following line -// REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); +REGISTER_VECTOR_SUM_BACKEND("CPU", cpu_vector_sum); -/*********************************** MUL BY SCALAR***********************************/ +/*********************************** PRODUCT ***********************************/ template -eIcicleError cpu_scalar_mul( - const Device& device, const T* scalar_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) +eIcicleError +cpu_vector_product(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_scalar_vector_op(VecOperation::SCALAR_MUL_VEC, scalar_a, vec_b, n, config, output); + TasksManager> task_manager(get_nof_workers(config) - 1); + std::vector output_initialized = std::vector(config.batch_size, false); + uint64_t vec_a_offset = 0; + uint64_t idx_in_batch = 0; + // run until all vector deployed and all tasks completed + while (true) { + VectorOpTask* task_p = + vec_a_offset < size ? task_manager.get_idle_or_completed_task() : task_manager.get_completed_task(); + if (task_p == nullptr) { return eIcicleError::SUCCESS; } + if (task_p->is_completed()) { + output[task_p->m_idx_in_batch] = output_initialized[task_p->m_idx_in_batch] + ? output[task_p->m_idx_in_batch] * task_p->m_intermidiate_res + : task_p->m_intermidiate_res; + output_initialized[task_p->m_idx_in_batch] = true; + } + if (vec_a_offset < size) { + task_p->m_idx_in_batch = idx_in_batch; + task_p->send_intermidiate_res_task( + VecOperation::VECTOR_PRODUCT, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - vec_a_offset), + config.columns_batch ? vec_a + idx_in_batch + vec_a_offset * config.batch_size + : vec_a + idx_in_batch * size + vec_a_offset, + config.columns_batch ? 
config.batch_size : 1); + idx_in_batch++; + if (idx_in_batch == config.batch_size) { + vec_a_offset += NOF_OPERATIONS_PER_TASK; + idx_in_batch = 0; + } + } else { + task_p->set_idle(); + } + } } -REGISTER_SCALAR_MUL_VEC_BACKEND("CPU", cpu_scalar_mul); +REGISTER_VECTOR_PRODUCT_BACKEND("CPU", cpu_vector_product); /*********************************** Scalar + Vector***********************************/ template eIcicleError cpu_scalar_add( - const Device& device, const T* scalar_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) + const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_scalar_vector_op(VecOperation::SCALAR_ADD_VEC, scalar_a, vec_b, n, config, output); + return cpu_scalar_vector_op(VecOperation::SCALAR_ADD_VEC, scalar_a, vec_b, size, config, output); } REGISTER_SCALAR_ADD_VEC_BACKEND("CPU", cpu_scalar_add); @@ -413,57 +584,149 @@ REGISTER_SCALAR_ADD_VEC_BACKEND("CPU", cpu_scalar_add); /*********************************** Scalar - Vector***********************************/ template eIcicleError cpu_scalar_sub( - const Device& device, const T* scalar_a, const T* vec_b, uint64_t n, const VecOpsConfig& config, T* output) + const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - return cpu_scalar_vector_op(VecOperation::SCALAR_SUB_VEC, scalar_a, vec_b, n, config, output); + return cpu_scalar_vector_op(VecOperation::SCALAR_SUB_VEC, scalar_a, vec_b, size, config, output); } REGISTER_SCALAR_SUB_VEC_BACKEND("CPU", cpu_scalar_sub); -/*********************************** CONVERT MONTGOMERY ***********************************/ +/*********************************** MUL BY SCALAR***********************************/ template -eIcicleError cpu_convert_montgomery( - const Device& device, const T* input, uint64_t n, bool is_into, const VecOpsConfig& config, T* output) +eIcicleError cpu_scalar_mul( + const Device& device, const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < n; i += NOF_OPERATIONS_PER_TASK) { - VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_1op_task( - is_into ? 
CONVERT_TO_MONTGOMERY : CONVERT_FROM_MONTGOMERY, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, n - i), - input + i, output + i); - } - task_manager.wait_done(); - return eIcicleError::SUCCESS; + return cpu_scalar_vector_op(VecOperation::SCALAR_MUL_VEC, scalar_a, vec_b, size, config, output); } -REGISTER_CONVERT_MONTGOMERY_BACKEND("CPU", cpu_convert_montgomery); - -#ifdef EXT_FIELD -REGISTER_VECTOR_ADD_EXT_FIELD_BACKEND("CPU", cpu_vector_add); -REGISTER_VECTOR_ACCUMULATE_EXT_FIELD_BACKEND("CPU", cpu_vector_accumulate); -REGISTER_VECTOR_SUB_EXT_FIELD_BACKEND("CPU", cpu_vector_sub); -REGISTER_VECTOR_MUL_EXT_FIELD_BACKEND("CPU", cpu_vector_mul); -REGISTER_CONVERT_MONTGOMERY_EXT_FIELD_BACKEND("CPU", cpu_convert_montgomery); -#endif // EXT_FIELD +REGISTER_SCALAR_MUL_VEC_BACKEND("CPU", cpu_scalar_mul); /*********************************** TRANSPOSE ***********************************/ + template -eIcicleError cpu_matrix_transpose( +eIcicleError out_of_place_matrix_transpose( const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) { - // Check for invalid arguments - if (!mat_in || !mat_out || nof_rows == 0 || nof_cols == 0) { return eIcicleError::INVALID_ARGUMENT; } + TasksManager> task_manager(get_nof_workers(config) - 1); + uint32_t stride = config.columns_batch ? config.batch_size : 1; + const uint64_t total_elements_one_mat = static_cast(nof_rows) * nof_cols; + const uint32_t NOF_ROWS_PER_TASK = + std::min((uint64_t)nof_rows, std::max((uint64_t)(NOF_OPERATIONS_PER_TASK / nof_cols), (uint64_t)1)); + for (uint32_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + const T* cur_mat_in = config.columns_batch ? mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements_one_mat; + T* cur_mat_out = config.columns_batch ? 
mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements_one_mat; + // Perform the matrix transpose + for (uint32_t i = 0; i < nof_rows; i += NOF_ROWS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_out_of_place_matrix_transpose_task( + OUT_OF_PLACE_MATRIX_TRANSPOSE, cur_mat_in + stride * i * nof_cols, + std::min((uint64_t)NOF_ROWS_PER_TASK, (uint64_t)nof_rows - i), nof_rows, nof_cols, stride, + cur_mat_out + (stride * i)); + } + } + task_manager.wait_done(); + return eIcicleError::SUCCESS; +} - // Perform the matrix transpose - for (uint32_t i = 0; i < nof_rows; ++i) { - for (uint32_t j = 0; j < nof_cols; ++j) { - mat_out[j * nof_rows + i] = mat_in[i * nof_cols + j]; +uint32_t gcd(uint32_t a, uint32_t b) +{ + while (b != 0) { + uint32_t temp = b; + b = a % b; + a = temp; + } + return a; +} + +// Recursive function to generate all k-ary necklaces and to replace the elements within the necklaces +template +void gen_necklace( + uint32_t t, + uint32_t p, + uint32_t k, + uint32_t length, + std::vector& necklace, + std::vector& task_indices) +{ + if (t > length) { + if ( + length % p == 0 && + !std::all_of(necklace.begin() + 1, necklace.begin() + length + 1, [first_element = necklace[1]](uint32_t x) { + return x == first_element; + })) { + uint32_t start_idx = 0; + uint64_t multiplier = 1; + for (int i = length; i >= 1; --i) { // Compute start_idx as the decimal representation of the necklace + start_idx += necklace[i] * multiplier; + multiplier *= k; + } + task_indices.push_back(start_idx); } + return; } + necklace[t] = necklace[t - p]; + gen_necklace(t + 1, p, k, length, necklace, task_indices); + + for (int i = necklace[t - p] + 1; i < k; ++i) { + necklace[t] = i; + gen_necklace(t + 1, t, k, length, necklace, task_indices); + } +} + +template +eIcicleError matrix_transpose_necklaces( + const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) +{ + uint32_t log_nof_rows = static_cast(std::floor(std::log2(nof_rows))); + uint32_t log_nof_cols = static_cast(std::floor(std::log2(nof_cols))); + uint32_t gcd_value = gcd(log_nof_rows, log_nof_cols); + uint32_t k = 1 << gcd_value; // Base of necklaces + uint32_t length = + (log_nof_cols + log_nof_rows) / gcd_value; // length of necklaces. Since all are powers of 2, equivalent to + // (log_nof_cols + log_nof_rows) / gcd_value; + const uint64_t max_nof_operations = NOF_OPERATIONS_PER_TASK / length; + const uint64_t total_elements_one_mat = static_cast(nof_rows) * nof_cols; + + std::vector necklace(length + 1, 0); + std::vector start_indices_in_mat; // Collect start indices + gen_necklace(1, 1, k, length, necklace, start_indices_in_mat); + + TasksManager> task_manager(get_nof_workers(config) - 1); + for (uint64_t i = 0; i < start_indices_in_mat.size(); i += max_nof_operations) { + uint64_t nof_operations = std::min((uint64_t)max_nof_operations, start_indices_in_mat.size() - i); + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_replace_elements_task( + REPLACE_ELEMENTS, config.columns_batch ? mat_in + idx_in_batch : mat_in + idx_in_batch * total_elements_one_mat, + nof_operations, start_indices_in_mat, i, log_nof_rows, log_nof_cols, + config.columns_batch ? config.batch_size : 1, + config.columns_batch ? 
mat_out + idx_in_batch : mat_out + idx_in_batch * total_elements_one_mat); + } + } + task_manager.wait_done(); return eIcicleError::SUCCESS; } +template +eIcicleError cpu_matrix_transpose( + const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) +{ + ICICLE_ASSERT(mat_in && mat_out && nof_rows != 0 && nof_cols != 0) << "Invalid argument"; + + // check if the number of rows and columns are powers of 2, if not use the basic transpose + bool is_power_of_2 = (nof_rows & (nof_rows - 1)) == 0 && (nof_cols & (nof_cols - 1)) == 0; + bool is_inplace = mat_in == mat_out; + if (!is_inplace) { + return (out_of_place_matrix_transpose(device, mat_in, nof_rows, nof_cols, config, mat_out)); + } else if (is_power_of_2) { + return (matrix_transpose_necklaces(mat_in, nof_rows, nof_cols, config, mat_out)); + } else { + ICICLE_LOG_ERROR << "Matrix transpose is not supported for inplace non power of 2 rows and columns"; + return eIcicleError::INVALID_ARGUMENT; + } +} + REGISTER_MATRIX_TRANSPOSE_BACKEND("CPU", cpu_matrix_transpose); #ifdef EXT_FIELD REGISTER_MATRIX_TRANSPOSE_EXT_FIELD_BACKEND("CPU", cpu_matrix_transpose); @@ -474,21 +737,23 @@ template eIcicleError cpu_bit_reverse(const Device& device, const T* vec_in, uint64_t size, const VecOpsConfig& config, T* vec_out) { - // Check for invalid arguments - if (!vec_in || !vec_out || size == 0) { return eIcicleError::INVALID_ARGUMENT; } + ICICLE_ASSERT(vec_in && vec_out && size != 0) << "Invalid argument"; - // Calculate log2(size) - int logn = static_cast(std::floor(std::log2(size))); - if ((1ULL << logn) != size) { - return eIcicleError::INVALID_ARGUMENT; // Ensure size is a power of 2 - } + uint32_t logn = static_cast(std::floor(std::log2(size))); + ICICLE_ASSERT((1ULL << logn) == size) << "Invalid argument - size is not a power of 2"; // Perform the bit reverse - TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < size; i += NOF_OPERATIONS_PER_TASK) { - VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_bitrev_task( - BIT_REVERSE, logn, i, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - i), vec_in, vec_out); + TasksManager> task_manager(get_nof_workers(config) - 1); + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + for (uint64_t i = 0; i < size; i += NOF_OPERATIONS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + + task_p->send_bit_reverse_task( + BIT_REVERSE, logn, i, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - i), + config.columns_batch ? vec_in + idx_in_batch : vec_in + idx_in_batch * size, + config.columns_batch ? config.batch_size : 1, + config.columns_batch ? 
vec_out + idx_in_batch : vec_out + idx_in_batch * size); + } } task_manager.wait_done(); return eIcicleError::SUCCESS; @@ -507,20 +772,25 @@ eIcicleError cpu_slice( const T* vec_in, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig& config, T* vec_out) { - if (vec_in == nullptr || vec_out == nullptr) { - ICICLE_LOG_ERROR << "Error: Invalid argument - input or output vector is null"; - return eIcicleError::INVALID_ARGUMENT; - } - - TasksManager> task_manager(get_nof_workers(config)); - for (uint64_t i = 0; i < size; i += NOF_OPERATIONS_PER_TASK) { - VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); - task_p->send_slice_task( - SLICE, stride, std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size - i), vec_in + offset + i * stride, vec_out + i); + ICICLE_ASSERT(vec_in != nullptr && vec_out != nullptr) << "Error: Invalid argument - input or output vector is null"; + ICICLE_ASSERT(offset + (size_out - 1) * stride < size_in) << "Error: Invalid argument - slice out of bound"; + + TasksManager> task_manager(get_nof_workers(config) - 1); + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { + for (uint64_t i = 0; i < size_out; i += NOF_OPERATIONS_PER_TASK) { + VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); + task_p->send_slice_task( + SLICE, config.columns_batch ? stride * config.batch_size : stride, config.columns_batch ? config.batch_size : 1, + std::min((uint64_t)NOF_OPERATIONS_PER_TASK, size_out - i), + config.columns_batch ? vec_in + idx_in_batch + (offset + i * stride) * config.batch_size + : vec_in + idx_in_batch * size_in + offset + i * stride, + config.columns_batch ? vec_out + idx_in_batch + i * config.batch_size : vec_out + idx_in_batch * size_out + i); + } } task_manager.wait_done(); return eIcicleError::SUCCESS; @@ -531,6 +801,29 @@ REGISTER_SLICE_BACKEND("CPU", cpu_slice); REGISTER_SLICE_EXT_FIELD_BACKEND("CPU", cpu_slice); #endif // EXT_FIELD +/*********************************** Highest non-zero idx ***********************************/ +template +eIcicleError cpu_highest_non_zero_idx( + const Device& device, const T* input, uint64_t size, const VecOpsConfig& config, int64_t* out_idx /*OUT*/) +{ + ICICLE_ASSERT(input && out_idx && size != 0) << "Error: Invalid argument"; + uint64_t stride = config.columns_batch ? config.batch_size : 1; + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) { + out_idx[idx_in_batch] = -1; // zero vector is considered '-1' since 0 would be zero in vec[0] + const T* curr_input = + config.columns_batch ? 
input + idx_in_batch : input + idx_in_batch * size; // Pointer to the current vector + for (int64_t i = size - 1; i >= 0; --i) { + if (curr_input[i * stride] != T::zero()) { + out_idx[idx_in_batch] = i; + break; + } + } + } + return eIcicleError::SUCCESS; +} + +REGISTER_HIGHEST_NON_ZERO_IDX_BACKEND("CPU", cpu_highest_non_zero_idx); + /*********************************** Polynomial evaluation ***********************************/ template @@ -543,12 +836,19 @@ eIcicleError cpu_poly_eval( const VecOpsConfig& config, T* evals /*OUT*/) { + ICICLE_ASSERT(coeffs && domain && evals && coeffs_size != 0 && domain_size != 0) << "Error: Invalid argument"; // using Horner's method // example: ax^2+bx+c is computed as (1) r=a, (2) r=r*x+b, (3) r=r*x+c - for (uint64_t eval_idx = 0; eval_idx < domain_size; ++eval_idx) { - evals[eval_idx] = coeffs[coeffs_size - 1]; - for (int64_t coeff_idx = coeffs_size - 2; coeff_idx >= 0; --coeff_idx) { - evals[eval_idx] = evals[eval_idx] * domain[eval_idx] + coeffs[coeff_idx]; + uint64_t stride = config.columns_batch ? config.batch_size : 1; + for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) { + const T* curr_coeffs = config.columns_batch ? coeffs + idx_in_batch : coeffs + idx_in_batch * coeffs_size; + T* curr_evals = config.columns_batch ? evals + idx_in_batch : evals + idx_in_batch * domain_size; + for (uint64_t eval_idx = 0; eval_idx < domain_size; ++eval_idx) { + curr_evals[eval_idx * stride] = curr_coeffs[(coeffs_size - 1) * stride]; + for (int64_t coeff_idx = coeffs_size - 2; coeff_idx >= 0; --coeff_idx) { + curr_evals[eval_idx * stride] = + curr_evals[eval_idx * stride] * domain[eval_idx] + curr_coeffs[coeff_idx * stride]; + } } } return eIcicleError::SUCCESS; @@ -556,38 +856,21 @@ eIcicleError cpu_poly_eval( REGISTER_POLYNOMIAL_EVAL("CPU", cpu_poly_eval); -/*********************************** Highest non-zero idx ***********************************/ -template -eIcicleError cpu_highest_non_zero_idx( - const Device& device, const T* input, uint64_t size, const VecOpsConfig& config, int64_t* out_idx /*OUT*/) -{ - *out_idx = -1; // zero vector is considered '-1' since 0 would be zero in vec[0] - for (int64_t i = size - 1; i >= 0; --i) { - if (input[i] != T::zero()) { - *out_idx = i; - break; - } - } - return eIcicleError::SUCCESS; -} - -REGISTER_HIGHEST_NON_ZERO_IDX_BACKEND("CPU", cpu_highest_non_zero_idx); - /*============================== polynomial division ==============================*/ template -void school_book_division_step_cpu(T* r, T* q, const T* b, int deg_r, int deg_b, const T& lc_b_inv) +void school_book_division_step_cpu(T* r, T* q, const T* b, int deg_r, int deg_b, const T& lc_b_inv, uint32_t stride = 1) { int64_t monomial = deg_r - deg_b; // monomial=1 is 'x', monomial=2 is x^2 etc. 
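+  // Note: `stride` is the distance between consecutive coefficients of the same polynomial:
+  // 1 for a single polynomial, and the batch size when coefficients are column-strided
+  // (columns_batch), which is how the batched division routine below calls this helper.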
- T lc_r = r[deg_r]; + T lc_r = r[deg_r * stride]; // leading coefficient of r T monomial_coeff = lc_r * lc_b_inv; // lc_r / lc_b // adding monomial s to q (q=q+s) - q[monomial] = monomial_coeff; + q[monomial * stride] = monomial_coeff; for (int i = monomial; i <= deg_r; ++i) { - T b_coeff = b[i - monomial]; - r[i] = r[i] - monomial_coeff * b_coeff; + T b_coeff = b[(i - monomial) * stride]; + r[i * stride] = r[i * stride] - monomial_coeff * b_coeff; } } @@ -595,36 +878,65 @@ template eIcicleError cpu_poly_divide( const Device& device, const T* numerator, - int64_t numerator_deg, - const T* denumerator, - int64_t denumerator_deg, + uint64_t numerator_size, + const T* denominator, + uint64_t denominator_size, const VecOpsConfig& config, T* q_out /*OUT*/, uint64_t q_size, T* r_out /*OUT*/, uint64_t r_size) { - ICICLE_ASSERT(r_size >= numerator_deg) - << "polynomial division expects r(x) size to be similar to numerator size and higher than numerator degree(x)"; - ICICLE_ASSERT(q_size >= (numerator_deg - denumerator_deg + 1)) - << "polynomial division expects q(x) size to be at least deg(numerator)-deg(denumerator)+1"; - - ICICLE_CHECK(icicle_copy_async(r_out, numerator, r_size * sizeof(T), config.stream)); - - // invert largest coeff of b - const T& lc_b_inv = T::inverse(denumerator[denumerator_deg]); - - int64_t deg_r = numerator_deg; - while (deg_r >= denumerator_deg) { - // each iteration is removing the largest monomial in r until deg(r)= numerator_deg + 1) + << "polynomial division expects r(x) size to be similar to numerator size and higher than numerator degree(x)"; + ICICLE_ASSERT(q_size >= (numerator_deg - denominator_deg + 1)) + << "polynomial division expects q(x) size to be at least deg(numerator)-deg(denominator)+1"; + + memset(curr_r_out, 0, sizeof(T) * r_size); + memcpy(curr_r_out, curr_numerator, sizeof(T) * (numerator_deg + 1)); + + // invert largest coeff of b + const T& lc_b_inv = T::inverse(curr_denominator[denominator_deg * stride]); + int64_t deg_r = numerator_deg; + while (deg_r >= denominator_deg) { + // each iteration is removing the largest monomial in r until deg(r)); \ No newline at end of file +REGISTER_POLYNOMIAL_DIVISION("CPU", cpu_poly_divide); + +#ifdef EXT_FIELD +REGISTER_VECTOR_ADD_EXT_FIELD_BACKEND("CPU", cpu_vector_add); +REGISTER_VECTOR_ACCUMULATE_EXT_FIELD_BACKEND("CPU", cpu_vector_accumulate); +REGISTER_VECTOR_SUB_EXT_FIELD_BACKEND("CPU", cpu_vector_sub); +REGISTER_VECTOR_MUL_EXT_FIELD_BACKEND("CPU", cpu_vector_mul); +REGISTER_VECTOR_DIV_EXT_FIELD_BACKEND("CPU", cpu_vector_div); +REGISTER_CONVERT_MONTGOMERY_EXT_FIELD_BACKEND("CPU", cpu_convert_montgomery); +REGISTER_VECTOR_SUM_EXT_FIELD_BACKEND("CPU", cpu_vector_sum); +REGISTER_VECTOR_PRODUCT_EXT_FIELD_BACKEND("CPU", cpu_vector_product); +REGISTER_SCALAR_MUL_VEC_EXT_FIELD_BACKEND("CPU", cpu_scalar_mul); +REGISTER_SCALAR_ADD_VEC_EXT_FIELD_BACKEND("CPU", cpu_scalar_add); +REGISTER_SCALAR_SUB_VEC_EXT_FIELD_BACKEND("CPU", cpu_scalar_sub); +#endif // EXT_FIELD \ No newline at end of file diff --git a/icicle/include/icicle/backend/vec_ops_backend.h b/icicle/include/icicle/backend/vec_ops_backend.h index 8ee0c0a15..3739fb780 100644 --- a/icicle/include/icicle/backend/vec_ops_backend.h +++ b/icicle/include/icicle/backend/vec_ops_backend.h @@ -7,16 +7,72 @@ using namespace field_config; namespace icicle { /*************************** Backend registration ***************************/ + using vectorVectorOpImplInplaceA = std::function; + + using scalarConvertMontgomeryImpl = std::function; + + using 
VectorReduceOpImpl = std::function; + using scalarVectorOpImpl = std::function; - using scalarVectorOpImplInplaceA = std::function; + using scalarMatrixOpImpl = std::function; + + using scalarBitReverseOpImpl = std::function; + + using scalarSliceOpImpl = std::function; + + using scalarHighNonZeroIdxOpImpl = std::function; + + using scalarPolyEvalImpl = std::function; + + using scalarPolyDivImpl = std::function; void register_vector_add(const std::string& deviceType, scalarVectorOpImpl impl); @@ -28,7 +84,7 @@ namespace icicle { }(); \ } - void register_vector_accumulate(const std::string& deviceType, scalarVectorOpImplInplaceA impl); + void register_vector_accumulate(const std::string& deviceType, vectorVectorOpImplInplaceA impl); #define REGISTER_VECTOR_ACCUMULATE_BACKEND(DEVICE_TYPE, FUNC) \ namespace { \ @@ -67,6 +123,36 @@ namespace icicle { }(); \ } + void register_scalar_convert_montgomery(const std::string& deviceType, scalarConvertMontgomeryImpl); + +#define REGISTER_CONVERT_MONTGOMERY_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_scalar_convert_mont) = []() -> bool { \ + register_scalar_convert_montgomery(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_vector_sum(const std::string& deviceType, VectorReduceOpImpl impl); + +#define REGISTER_VECTOR_SUM_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_sum) = []() -> bool { \ + register_vector_sum(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_vector_product(const std::string& deviceType, VectorReduceOpImpl impl); + +#define REGISTER_VECTOR_PRODUCT_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_product) = []() -> bool { \ + register_vector_product(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + void register_scalar_mul_vec(const std::string& deviceType, scalarVectorOpImpl impl); #define REGISTER_SCALAR_MUL_VEC_BACKEND(DEVICE_TYPE, FUNC) \ @@ -97,32 +183,6 @@ namespace icicle { }(); \ } - using scalarConvertMontgomeryImpl = std::function; - - void register_scalar_convert_montgomery(const std::string& deviceType, scalarConvertMontgomeryImpl); - -#define REGISTER_CONVERT_MONTGOMERY_BACKEND(DEVICE_TYPE, FUNC) \ - namespace { \ - static bool UNIQUE(_reg_scalar_convert_mont) = []() -> bool { \ - register_scalar_convert_montgomery(DEVICE_TYPE, FUNC); \ - return true; \ - }(); \ - } - - using scalarMatrixOpImpl = std::function; - void register_matrix_transpose(const std::string& deviceType, scalarMatrixOpImpl impl); #define REGISTER_MATRIX_TRANSPOSE_BACKEND(DEVICE_TYPE, FUNC) \ @@ -133,9 +193,6 @@ namespace icicle { }(); \ } - using scalarBitReverseOpImpl = std::function; - void register_scalar_bit_reverse(const std::string& deviceType, scalarBitReverseOpImpl); #define REGISTER_BIT_REVERSE_BACKEND(DEVICE_TYPE, FUNC) \ @@ -146,15 +203,6 @@ namespace icicle { }(); \ } - using scalarSliceOpImpl = std::function; - void register_slice(const std::string& deviceType, scalarSliceOpImpl); #define REGISTER_SLICE_BACKEND(DEVICE_TYPE, FUNC) \ @@ -165,9 +213,6 @@ namespace icicle { }(); \ } - using scalarHighNonZeroIdxOpImpl = std::function; - void register_highest_non_zero_idx(const std::string& deviceType, scalarHighNonZeroIdxOpImpl); #define REGISTER_HIGHEST_NON_ZERO_IDX_BACKEND(DEVICE_TYPE, FUNC) \ @@ -178,24 +223,6 @@ namespace icicle { }(); \ } - template - eIcicleError polynomial_eval( - const T* coeffs, - uint64_t coeffs_size, - const T* domain, - uint64_t domain_size, - const VecOpsConfig& config, - T* evals /*OUT*/); 
- - using scalarPolyEvalImpl = std::function; - void register_poly_eval(const std::string& deviceType, scalarPolyEvalImpl); #define REGISTER_POLYNOMIAL_EVAL(DEVICE_TYPE, FUNC) \ @@ -206,18 +233,6 @@ namespace icicle { }(); \ } - using scalarPolyDivImpl = std::function; - void register_poly_division(const std::string& deviceType, scalarPolyDivImpl); #define REGISTER_POLYNOMIAL_DIVISION(DEVICE_TYPE, FUNC) \ @@ -233,12 +248,23 @@ namespace icicle { const Device& device, const extension_t* vec_a, const extension_t* vec_b, - uint64_t n, + uint64_t size, const VecOpsConfig& config, extension_t* output)>; using extFieldVectorOpImplInplaceA = std::function; + const Device& device, extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config)>; + + using extFieldVectorReduceOpImpl = std::function; + + using extFieldVectorOpImpl = std::function; void register_extension_vector_add(const std::string& deviceType, extFieldVectorOpImpl impl); @@ -279,11 +305,71 @@ namespace icicle { }(); \ } + void register_extension_vector_div(const std::string& deviceType, extFieldVectorOpImpl impl); + + #define REGISTER_VECTOR_DIV_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_div_ext_field) = []() -> bool { \ + register_extension_vector_div(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_extension_scalar_mul_vec(const std::string& deviceType, extFieldVectorOpImpl impl); + + #define REGISTER_SCALAR_MUL_VEC_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_scalar_mul_vec_ext_field) = []() -> bool { \ + register_extension_scalar_mul_vec(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_extension_scalar_add_vec(const std::string& deviceType, extFieldVectorOpImpl impl); + + #define REGISTER_SCALAR_ADD_VEC_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_scalar_add_vec_ext_field) = []() -> bool { \ + register_extension_scalar_add_vec(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_extension_scalar_sub_vec(const std::string& deviceType, extFieldVectorOpImpl impl); + + #define REGISTER_SCALAR_SUB_VEC_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_scalar_sub_vec_ext_field) = []() -> bool { \ + register_extension_scalar_sub_vec(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_extension_vector_sum(const std::string& deviceType, extFieldVectorReduceOpImpl impl); + + #define REGISTER_VECTOR_SUM_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_sum_ext_field) = []() -> bool { \ + register_extension_vector_sum(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + + void register_extension_vector_product(const std::string& deviceType, extFieldVectorReduceOpImpl impl); + + #define REGISTER_VECTOR_PRODUCT_EXT_FIELD_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_vec_product_ext_field) = []() -> bool { \ + register_extension_vector_product(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + using extFieldConvertMontgomeryImpl = std::function; @@ -333,7 +419,8 @@ namespace icicle { const extension_t* input, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig& config, extension_t* output)>; diff --git a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h index f0643f978..ef59f816f 
100644 --- a/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h +++ b/icicle/include/icicle/polynomials/default_backend/default_poly_backend.h @@ -65,7 +65,7 @@ namespace icicle { config.is_async = true; config.stream = m_stream; - ICICLE_CHECK(icicle::slice(in_coeffs, offset, stride, out_size, config, out_coeffs)); + ICICLE_CHECK(icicle::slice(in_coeffs, offset, stride, in_size, out_size, config, out_coeffs)); } void add_sub(PolyContext& res, PolyContext a, PolyContext b, bool add1_sub0) @@ -278,7 +278,7 @@ namespace icicle { config.is_result_on_device = true; ICICLE_CHECK(icicle::polynomial_division( - a_coeffs, deg_a, b_coeffs, deg_b, config, Q_coeffs, deg_a - deg_b + 1, R_coeffs, a_N)); + a_coeffs, deg_a + 1, b_coeffs, deg_b + 1, config, Q_coeffs, deg_a - deg_b + 1, R_coeffs, a_N)); } void quotient(PolyContext Q, PolyContext op_a, PolyContext op_b) override @@ -546,8 +546,8 @@ namespace icicle { config.is_result_on_device = true; config.is_async = true; config.stream = m_stream; - ICICLE_CHECK( - icicle::slice(get_context_storage_immutable(p), 0 /*offset*/, stride, domain_size, config, d_evals)); + ICICLE_CHECK(icicle::slice( + get_context_storage_immutable(p), 0 /*offset*/, stride, poly_size, domain_size, config, d_evals)); } else { ICICLE_CHECK(icicle_memset(d_evals, 0, domain_size * sizeof(I))); auto ntt_config = default_ntt_config(); diff --git a/icicle/include/icicle/utils/modifiers.h b/icicle/include/icicle/utils/modifiers.h index a8728d279..b652e9829 100644 --- a/icicle/include/icicle/utils/modifiers.h +++ b/icicle/include/icicle/utils/modifiers.h @@ -33,4 +33,4 @@ #else #define LONG_CONST_SUFFIX(x) x##L #define PACKED(x) x __attribute__((packed)) -#endif \ No newline at end of file +#endif diff --git a/icicle/include/icicle/vec_ops.h b/icicle/include/icicle/vec_ops.h index b23cd0a4b..38551ab6a 100644 --- a/icicle/include/icicle/vec_ops.h +++ b/icicle/include/icicle/vec_ops.h @@ -17,17 +17,22 @@ namespace icicle { * @note APIs with a single input, ignore input b. */ struct VecOpsConfig { - icicleStreamHandle stream; /**< Stream for asynchronous execution. */ - bool is_a_on_device; /**< True if `a` is on the device, false if it is not. Default value: false. */ - bool is_b_on_device; /**< True if `b` is on the device, false if it is not. Default value: false. OPTIONAL. */ - bool is_result_on_device; /**< If true, the output is preserved on the device, otherwise on the host. Default value: - false. */ - bool is_async; /**< Whether to run the vector operations asynchronously. - If set to `true`, the function will be non-blocking and synchronization - must be explicitly managed using `cudaStreamSynchronize` or - `cudaDeviceSynchronize`. If set to `false`, the function will block the current CPU - thread. */ - ConfigExtension* ext = nullptr; /**< Backend-specific extension. */ + icicleStreamHandle stream; /** Stream for asynchronous execution. */ + bool is_a_on_device; /** True if `a` is on the device, false if it is not. Default value: false. */ + bool is_b_on_device; /** True if `b` is on the device, false if it is not. Default value: false. OPTIONAL. */ + bool is_result_on_device; /** If true, the output is preserved on the device, otherwise on the host. Default value: + false. */ + bool is_async; /** Whether to run the vector operations asynchronously. + If set to `true`, the function will be non-blocking and synchronization + must be explicitly managed using `cudaStreamSynchronize` or `cudaDeviceSynchronize`. 
+ If set to `false`, the function will block the current CPU thread. */ + int batch_size; /** Number of vectors (or operations) to process in a batch. + Each vector operation will be performed independently on each batch element. + Default value: 1. */ + bool columns_batch; /** True if the batched vectors are stored as columns in a 2D array (i.e., the vectors are + strided in memory as columns of a matrix). If false, the batched vectors are stored + contiguously in memory (e.g., as rows or in a flat array). Default value: false. */ + ConfigExtension* ext = nullptr; /** Backend-specific extension. */ }; /** @@ -43,6 +48,8 @@ namespace icicle { false, // is_b_on_device false, // is_result_on_device false, // is_async + 1, // batch_size + false, // columns_batch }; return config; } @@ -53,11 +60,17 @@ namespace icicle { * @brief Adds two vectors element-wise. * * @tparam T Type of the elements in the vectors. - * @param vec_a Input vector `a`. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). + * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. */ template @@ -67,24 +80,36 @@ namespace icicle { * @brief Accumulates the elements of two vectors element-wise and stores the result in the first vector. * * @tparam T Type of the elements in the vectors. - * @param vec_a Input/output vector `a`. The result will be written back to this vector. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first Input/output vector(s). The result will be written back to this vector. + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). + * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. * @return eIcicleError Error code indicating success or failure. */ template - eIcicleError vector_accumulate(T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config); + eIcicleError + vector_accumulate(T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config); // use vector_add (inplace) /** * @brief Subtracts vector `b` from vector `a` element-wise. * * @tparam T Type of the elements in the vectors. - * @param vec_a Input vector `a`. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. 
+ * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). + * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. */ template @@ -94,11 +119,17 @@ namespace icicle { * @brief Multiplies two vectors element-wise. * * @tparam T Type of the elements in the vectors. - * @param vec_a Input vector `a`. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). + * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. */ template @@ -108,11 +139,17 @@ namespace icicle { * @brief Divides vector `a` by vector `b` element-wise. * * @tparam T Type of the elements in the vectors. - * @param vec_a Input vector `a`. - * @param vec_b Input vector `b`. - * @param size Number of elements in the vectors. + * @param vec_a Pointer to the first input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param vec_b Pointer to the second input vector(s). + * - The storage layout should match that of `vec_a`. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. */ template @@ -122,15 +159,59 @@ namespace icicle { * @brief Converts elements to and from Montgomery form. * * @tparam T Type of the elements. - * @param input Input vector. - * @param size Number of elements in the input vector. - * @param is_into True to convert into Montgomery form, false to convert out of Montgomery form. + * @param input Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously in memory. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in each vector. 
+ * @param is_to_montgomery True to convert into Montgomery form, false to convert out of Montgomery form. + * @param config Configuration for the operation. + * @param output Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. + * @return eIcicleError Error code indicating success or failure. + */ + template + eIcicleError + convert_montgomery(const T* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, T* output); + + // Reduction operations + + /** + * @brief Computes the sum of all elements in each vector in a batch. + * + * @tparam T Type of the elements in the vector. + * @param vec_a Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in each vector. + * @param config Configuration for the operation. + * @param output Pointer to the output array where the results will be stored. + * @return eIcicleError Error code indicating success or failure. + */ + + template + eIcicleError vector_sum(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); + + /** + * @brief Computes the product of all elements in each vector in the batch. + * + * @tparam T Type of the elements in the vectors. + * @param vec_a Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in each vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output array where the results will be stored. * @return eIcicleError Error code indicating success or failure. */ + template - eIcicleError convert_montgomery(const T* input, uint64_t size, bool is_into, const VecOpsConfig& config, T* output); + eIcicleError vector_product(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output); // Scalar-Vector operations @@ -138,12 +219,17 @@ namespace icicle { * @brief Adds a scalar to each element of a vector. * * @tparam T Type of the elements in the vector and the scalar. - * @param scalar_a Input scalar. - * @param vec_b Input vector. - * @param size Number of elements in the vector. + * @param scalar_a Pointer to the input scalar(s). + * @param vec_b Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in a vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. + * @note To subtract a scalar from each element of a vector - use scalar_add_vec with negative scalar. 
*/ template eIcicleError scalar_add_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); @@ -152,12 +238,17 @@ namespace icicle { * @brief Subtracts each element of a vector from a scalar, elementwise (res[i]=scalar-vec[i]). * * @tparam T Type of the elements in the vector and the scalar. - * @param scalar_a Input scalar. - * @param vec_b Input vector. - * @param size Number of elements in the vector. + * @param scalar_a Pointer to Input scalar(s). + * @param vec_b Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in a vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. + * @note To subtract a scalar from each element of a vector - use scalar_add_vec with negative scalar. */ template eIcicleError scalar_sub_vec(const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); @@ -166,11 +257,15 @@ namespace icicle { * @brief Multiplies each element of a vector by a scalar. * * @tparam T Type of the elements in the vector and the scalar. - * @param scalar_a Input scalar. - * @param vec_b Input vector. - * @param size Number of elements in the vector. + * @param scalar_a Pointer to Input scalar(s). + * @param vec_b Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in a vector. * @param config Configuration for the operation. - * @param output Output vector to store the result. + * @param output Pointer to the output vector(s) where the results will be stored. * @return eIcicleError Error code indicating success or failure. */ template @@ -182,12 +277,15 @@ namespace icicle { * @brief Transposes a matrix. * * @tparam T Type of the elements in the matrix. - * @param mat_in Input matrix. - * @param nof_rows Number of rows in the input matrix. - * @param nof_cols Number of columns in the input matrix. + * @param mat_in Pointer to the input matrix or matrices. + * @param nof_rows Number of rows in each input matrix. + * @param nof_cols Number of columns in each input matrix. * @param config Configuration for the operation. - * @param mat_out Output matrix to store the result. + * @param mat_out Pointer to the output matrix or matrices where the transposed matrices will be stored. * @return eIcicleError Error code indicating success or failure. + * @note The input matrices are assumed to be stored in row-major order. + * This function transposes an input matrix or a batch of matrices. + * Matrix transpose inplace is not supported for non-power of 2 rows and columns. */ template eIcicleError @@ -196,42 +294,65 @@ namespace icicle { // Miscellaneous operations /** - * @brief Reorders the vector elements based on bit-reverse. That is out[i]=in[bitrev[i]]. + * @brief Reorders the vector (or batch of vectors) elements based on bit-reverse. That is out[i]=in[bitrev[i]]. * * @tparam T Type of the elements in the vector. 
- * @param vec_in Input vector. - * @param size Number of elements in the input vector. + * @param vec_in Pointer to the input vector(s). + * - If `config.batch_size > 1`, this should be a concatenated array of vectors. + * - The layout depends on `config.columns_batch`: + * - If `false`, vectors are stored contiguously. + * - If `true`, vectors are stored as columns in a 2D array. + * @param size Number of elements in each vector. Must be a power of 2. * @param config Configuration for the operation. - * @param vec_out Output vector to store the result. + * @param vec_out Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. + * @note If `vec_in` and `vec_out` point to the same memory location, the operation is performed in-place. */ template eIcicleError bit_reverse(const T* vec_in, uint64_t size, const VecOpsConfig& config, T* vec_out); /** - * @brief Extracts a slice from a vector. + * @brief Extracts a slice from a vector or batch of vectors. * * @tparam T Type of the elements in the vector. - * @param vec_in Input vector. - * @param offset Offset from which to start the slice. + * @param vec_in Pointer to the input vector(s). + * @param offset Offset from which to start the slice in each vector. * @param stride Stride between elements in the slice. - * @param size Number of elements in the slice. + * @param size_in Number of elements in one input vector. + * @param size_out Number of elements in one output vector. * @param config Configuration for the operation. - * @param vec_out Output vector to store the result. + * @param vec_out Pointer to the output vector(s) where the results will be stored. + * The output array should have the same storage layout as the input vectors. * @return eIcicleError Error code indicating success or failure. + * @note The total input size is `size_in * config.batch_size`. + * The total output size is `size_out * config.batch_size`. + * The parameters must satisfy: offset + (size_out - 1) * stride < size_in */ template + eIcicleError slice( + const T* vec_in, + uint64_t offset, + uint64_t stride, + uint64_t size_in, + uint64_t size_out, + const VecOpsConfig& config, + T* vec_out); + + // Deprecated slice API + template eIcicleError - slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size, const VecOpsConfig& config, T* vec_out); + slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size_out, const VecOpsConfig& config, T* vec_out); /** - * @brief Finds the highest non-zero index in a vector. + * @brief Finds the highest non-zero index in a vector or batch of vectors. * * @tparam T Type of the elements in the vector. - * @param vec_in Input vector. - * @param size Number of elements in the input vector. + * @param vec_in Pointer to the input vector(s). + * @param size Number of elements in each input vector. * @param config Configuration for the operation. - * @param out_idx Output index of the highest non-zero element. + * @param out_idx Pointer to an array where the output indices of the highest non-zero element in each input vector + * will be stored. The array should have a length of `config.batch_size`. * @return eIcicleError Error code indicating success or failure. */ template @@ -241,12 +362,21 @@ namespace icicle { * @brief Evaluates a polynomial at given domain points. * * @tparam T Type of the elements in the polynomial and domain.
- * @param coeffs Pointer to the array of coefficients of the polynomial. - * @param coeffs_size Number of coefficients in the polynomial. - * @param domain Pointer to the array of points at which to evaluate the polynomial. + * @param coeffs Pointer to the array of coefficients of the polynomial(s). + * - The size of `coeffs` should be `coeffs_size * batch_size`. + * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored + * contiguously. + * - If `config.columns_batch` is `true`, coefficients are interleaved. + * @param coeffs_size Number of coefficients in each polynomial. + * @param domain Pointer to the array of points at which to evaluate the polynomial(s). + * - The same domain is used for all polynomials. + * - The size of `domain` should be `domain_size`. * @param domain_size Number of domain points. * @param config Configuration for the operation. * @param evals Pointer to the array where the evaluated results will be stored. This is an output parameter. + * - The size of `evals` should be `domain_size * batch_size`. + * - If `config.columns_batch` is `false`, results for each polynomial are stored contiguously. + * - If `config.columns_batch` is `true`, results are interleaved. * @return eIcicleError Error code indicating success or failure. */ template @@ -259,26 +389,39 @@ namespace icicle { T* evals /*OUT*/); /** - * @brief Divides two polynomials. + * @brief Divides two polynomials, or batches of polynomial pairs. * * @tparam T Type of the elements in the polynomials. - * @param numerator Pointer to the array of coefficients of the numerator polynomial. - * @param numerator_deg Degree of the numerator polynomial. - * @param denominator Pointer to the array of coefficients of the denominator polynomial. - * @param denominator_deg Degree of the denominator polynomial. + * @param numerator Pointer to the array of coefficients of the numerator polynomial(s). + * - The size of `numerator` should be `(numerator_deg + 1) * batch_size`. + * - If `config.columns_batch` is `false`, coefficients for each polynomial in the batch are stored + * contiguously. + * - If `config.columns_batch` is `true`, coefficients are interleaved. + * @param numerator_size Size (number of T elements) of the numerator vector for a single batch element. + * @param denominator Pointer to the array of coefficients of the denominator polynomial(s). + * - Storage layout is similar to `numerator`. + * @param denominator_size Size (number of T elements) of the denominator vector for a single batch element. * @param config Configuration for the operation. - * @param q_out Pointer to the array where the quotient will be stored. This is an output parameter. - * @param q_size Size of the quotient array. - * @param r_out Pointer to the array where the remainder will be stored. This is an output parameter. + * @param q_out Pointer to the array where the quotient polynomial(s) will be stored. This is an output parameter. + * - The storage layout should match that of `numerator`. + * @param q_size Size of the quotient array for one polynomial. + * @param r_out Pointer to the array where the remainder polynomial(s) will be stored. This is an output parameter. + * - The storage layout should match that of `numerator`. + * - The size of `r_out` should be sufficient to hold the remainder coefficients for each polynomial. * @param r_size Size of the remainder array. * @return eIcicleError Error code indicating success or failure.
+ * + * @note The degrees should satisfy `numerator_deg >= denominator_deg`. + * The sizes `q_size` and `r_size` must be at least `numerator_deg - denominator_deg + 1` and `denominator_deg`, + * respectively. The function assumes that the input and output arrays are properly allocated. */ + template eIcicleError polynomial_division( const T* numerator, - int64_t numerator_deg, - const T* denumerator, - int64_t denumerator_deg, + uint64_t numerator_size, + const T* denominator, + uint64_t denominator_size, const VecOpsConfig& config, T* q_out /*OUT*/, uint64_t q_size, diff --git a/icicle/src/vec_ops.cpp b/icicle/src/vec_ops.cpp index d42fa0dca..6e159074f 100644 --- a/icicle/src/vec_ops.cpp +++ b/icicle/src/vec_ops.cpp @@ -3,67 +3,130 @@ namespace icicle { + /*********************************** REDUCE PRODUCT ************************/ + ICICLE_DISPATCHER_INST(VectorProductDispatcher, vector_product, VectorReduceOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_product)( + const scalar_t* vec_a, uint64_t size, const VecOpsConfig* config, scalar_t* output) + { + return VectorProductDispatcher::execute(vec_a, size, *config, output); + } + + template <> + eIcicleError vector_product(const scalar_t* vec_a, uint64_t size, const VecOpsConfig& config, scalar_t* output) + { + return CONCAT_EXPAND(FIELD, vector_product)(vec_a, size, &config, output); + } + +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(VectorProductExtFieldDispatcher, extension_vector_product, extFieldVectorReduceOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_product)( + const extension_t* vec_a, uint64_t size, const VecOpsConfig* config, extension_t* output) + { + return VectorProductExtFieldDispatcher::execute(vec_a, size, *config, output); + } + + template <> + eIcicleError vector_product(const extension_t* vec_a, uint64_t size, const VecOpsConfig& config, extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_vector_product)(vec_a, size, &config, output); + } +#endif // EXT_FIELD + + /*********************************** REDUCE SUM ****************************/ + ICICLE_DISPATCHER_INST(VectorSumDispatcher, vector_sum, VectorReduceOpImpl); + + extern "C" eIcicleError + CONCAT_EXPAND(FIELD, vector_sum)(const scalar_t* vec_a, uint64_t size, const VecOpsConfig* config, scalar_t* output) + { + return VectorSumDispatcher::execute(vec_a, size, *config, output); + } + + template <> + eIcicleError vector_sum(const scalar_t* vec_a, uint64_t size, const VecOpsConfig& config, scalar_t* output) + { + return CONCAT_EXPAND(FIELD, vector_sum)(vec_a, size, &config, output); + } + +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(VectorSumExtFieldDispatcher, extension_vector_sum, extFieldVectorReduceOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_sum)( + const extension_t* vec_a, uint64_t size, const VecOpsConfig* config, extension_t* output) + { + return VectorSumExtFieldDispatcher::execute(vec_a, size, *config, output); + } + + template <> + eIcicleError vector_sum(const extension_t* vec_a, uint64_t size, const VecOpsConfig& config, extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_vector_sum)(vec_a, size, &config, output); + } +#endif // EXT_FIELD + /*********************************** ADD ***********************************/ ICICLE_DISPATCHER_INST(VectorAddDispatcher, vector_add, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_add)( - const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + 
const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorAddDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorAddDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError - vector_add(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_add(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_add)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_add)(vec_a, vec_b, size, &config, output); } #ifdef EXT_FIELD ICICLE_DISPATCHER_INST(VectorAddExtFieldDispatcher, extension_vector_add, extFieldVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_add)( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig* config, extension_t* output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig* config, extension_t* output) { - return VectorAddExtFieldDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorAddExtFieldDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError vector_add( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig& config, extension_t* output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_vector_add)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, extension_vector_add)(vec_a, vec_b, size, &config, output); } #endif // EXT_FIELD /*********************************** ACCUMULATE ***********************************/ - ICICLE_DISPATCHER_INST(VectorAccumulateDispatcher, vector_accumulate, scalarVectorOpImplInplaceA); + ICICLE_DISPATCHER_INST(VectorAccumulateDispatcher, vector_accumulate, vectorVectorOpImplInplaceA); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_accumulate)( - scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config) + scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config) { - return VectorAccumulateDispatcher::execute(vec_a, vec_b, n, *config); + return VectorAccumulateDispatcher::execute(vec_a, vec_b, size, *config); } template <> - eIcicleError vector_accumulate(scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config) + eIcicleError vector_accumulate(scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config) { - return CONCAT_EXPAND(FIELD, vector_accumulate)(vec_a, vec_b, n, &config); + return CONCAT_EXPAND(FIELD, vector_accumulate)(vec_a, vec_b, size, &config); } #ifdef EXT_FIELD ICICLE_DISPATCHER_INST(VectorAccumulateExtFieldDispatcher, extension_vector_accumulate, extFieldVectorOpImplInplaceA); extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_accumulate)( - extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig* config) + extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig* config) { - return VectorAccumulateExtFieldDispatcher::execute(vec_a, vec_b, n, *config); + return VectorAccumulateExtFieldDispatcher::execute(vec_a, vec_b, size, *config); } template <> - eIcicleError vector_accumulate(extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig& config) + eIcicleError + vector_accumulate(extension_t* vec_a, 
const extension_t* vec_b, uint64_t size, const VecOpsConfig& config) { - return CONCAT_EXPAND(FIELD, extension_vector_accumulate)(vec_a, vec_b, n, &config); + return CONCAT_EXPAND(FIELD, extension_vector_accumulate)(vec_a, vec_b, size, &config); } #endif // EXT_FIELD @@ -71,32 +134,32 @@ namespace icicle { ICICLE_DISPATCHER_INST(VectorSubDispatcher, vector_sub, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_sub)( - const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorSubDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorSubDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError - vector_sub(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_sub(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_sub)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_sub)(vec_a, vec_b, size, &config, output); } #ifdef EXT_FIELD ICICLE_DISPATCHER_INST(VectorSubExtFieldDispatcher, extension_vector_sub, extFieldVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_sub)( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig* config, extension_t* output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig* config, extension_t* output) { - return VectorSubExtFieldDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorSubExtFieldDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError vector_sub( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig& config, extension_t* output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_vector_sub)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, extension_vector_sub)(vec_a, vec_b, size, &config, output); } #endif // EXT_FIELD @@ -104,32 +167,32 @@ namespace icicle { ICICLE_DISPATCHER_INST(VectorMulDispatcher, vector_mul, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_mul)( - const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorMulDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorMulDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError - vector_mul(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_mul(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_mul)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_mul)(vec_a, vec_b, size, &config, output); } #ifdef EXT_FIELD ICICLE_DISPATCHER_INST(VectorMulExtFieldDispatcher, extension_vector_mul, extFieldVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_mul)( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig* config, extension_t* output) + 
const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig* config, extension_t* output) { - return VectorMulExtFieldDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorMulExtFieldDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError vector_mul( - const extension_t* vec_a, const extension_t* vec_b, uint64_t n, const VecOpsConfig& config, extension_t* output) + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_vector_mul)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, extension_vector_mul)(vec_a, vec_b, size, &config, output); } #endif // EXT_FIELD @@ -137,80 +200,172 @@ namespace icicle { ICICLE_DISPATCHER_INST(VectorDivDispatcher, vector_div, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, vector_div)( - const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return VectorDivDispatcher::execute(vec_a, vec_b, n, *config, output); + return VectorDivDispatcher::execute(vec_a, vec_b, size, *config, output); } template <> eIcicleError - vector_div(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + vector_div(const scalar_t* vec_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, vector_div)(vec_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, vector_div)(vec_a, vec_b, size, &config, output); } +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(VectorDivExtFieldDispatcher, extension_vector_div, extFieldVectorOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_vector_div)( + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig* config, extension_t* output) + { + return VectorDivExtFieldDispatcher::execute(vec_a, vec_b, size, *config, output); + } + + template <> + eIcicleError vector_div( + const extension_t* vec_a, const extension_t* vec_b, uint64_t size, const VecOpsConfig& config, extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_vector_div)(vec_a, vec_b, size, &config, output); + } +#endif // EXT_FIELD + /*********************************** (Scalar + Vector) ELEMENT WISE ***********************************/ ICICLE_DISPATCHER_INST(ScalarAddDispatcher, scalar_add_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_add_vec)( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return ScalarAddDispatcher::execute(scalar_a, vec_b, n, *config, output); + return ScalarAddDispatcher::execute(scalar_a, vec_b, size, *config, output); } template <> eIcicleError scalar_add_vec( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) + { + return CONCAT_EXPAND(FIELD, scalar_add_vec)(scalar_a, vec_b, size, &config, output); + } + +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(ScalarAddExtFieldDispatcher, extension_scalar_add_vec, extFieldVectorOpImpl); + + extern "C" eIcicleError 
CONCAT_EXPAND(FIELD, extension_scalar_add_vec)( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig* config, + extension_t* output) { - return CONCAT_EXPAND(FIELD, scalar_add_vec)(scalar_a, vec_b, n, &config, output); + return ScalarAddExtFieldDispatcher::execute(scalar_a, vec_b, size, *config, output); } + template <> + eIcicleError scalar_add_vec( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig& config, + extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_scalar_add_vec)(scalar_a, vec_b, size, &config, output); + } +#endif // EXT_FIELD + /*********************************** (Scalar - Vector) ELEMENT WISE ***********************************/ ICICLE_DISPATCHER_INST(ScalarSubDispatcher, scalar_sub_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_sub_vec)( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) { - return ScalarSubDispatcher::execute(scalar_a, vec_b, n, *config, output); + return ScalarSubDispatcher::execute(scalar_a, vec_b, size, *config, output); } template <> eIcicleError scalar_sub_vec( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, scalar_sub_vec)(scalar_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, scalar_sub_vec)(scalar_a, vec_b, size, &config, output); } + +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(ScalarSubExtFieldDispatcher, extension_scalar_sub_vec, extFieldVectorOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_scalar_sub_vec)( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig* config, + extension_t* output) + { + return ScalarSubExtFieldDispatcher::execute(scalar_a, vec_b, size, *config, output); + } + + template <> + eIcicleError scalar_sub_vec( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig& config, + extension_t* output) + { + return CONCAT_EXPAND(FIELD, extension_scalar_sub_vec)(scalar_a, vec_b, size, &config, output); + } +#endif // EXT_FIELD /*********************************** MUL BY SCALAR ***********************************/ ICICLE_DISPATCHER_INST(ScalarMulDispatcher, scalar_mul_vec, scalarVectorOpImpl); extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_mul_vec)( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig* config, scalar_t* output) + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig* config, scalar_t* output) + { + return ScalarMulDispatcher::execute(scalar_a, vec_b, size, *config, output); + } + + template <> + eIcicleError scalar_mul_vec( + const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t size, const VecOpsConfig& config, scalar_t* output) + { + return CONCAT_EXPAND(FIELD, scalar_mul_vec)(scalar_a, vec_b, size, &config, output); + } + +#ifdef EXT_FIELD + ICICLE_DISPATCHER_INST(ScalarMulExtFieldDispatcher, extension_scalar_mul_vec, extFieldVectorOpImpl); + + extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_scalar_mul_vec)( + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig* config, + 
extension_t* output) { - return ScalarMulDispatcher::execute(scalar_a, vec_b, n, *config, output); + return ScalarMulExtFieldDispatcher::execute(scalar_a, vec_b, size, *config, output); } template <> eIcicleError scalar_mul_vec( - const scalar_t* scalar_a, const scalar_t* vec_b, uint64_t n, const VecOpsConfig& config, scalar_t* output) + const extension_t* scalar_a, + const extension_t* vec_b, + uint64_t size, + const VecOpsConfig& config, + extension_t* output) { - return CONCAT_EXPAND(FIELD, scalar_mul_vec)(scalar_a, vec_b, n, &config, output); + return CONCAT_EXPAND(FIELD, extension_scalar_mul_vec)(scalar_a, vec_b, size, &config, output); } +#endif // EXT_FIELD /*********************************** CONVERT MONTGOMERY ***********************************/ ICICLE_DISPATCHER_INST(ScalarConvertMontgomeryDispatcher, scalar_convert_montgomery, scalarConvertMontgomeryImpl) extern "C" eIcicleError CONCAT_EXPAND(FIELD, scalar_convert_montgomery)( - const scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, scalar_t* output) + const scalar_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig* config, scalar_t* output) { - return ScalarConvertMontgomeryDispatcher::execute(input, size, is_into, *config, output); + return ScalarConvertMontgomeryDispatcher::execute(input, size, is_to_montgomery, *config, output); } template <> - eIcicleError - convert_montgomery(const scalar_t* input, uint64_t size, bool is_into, const VecOpsConfig& config, scalar_t* output) + eIcicleError convert_montgomery( + const scalar_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, scalar_convert_montgomery)(input, size, is_into, &config, output); + return CONCAT_EXPAND(FIELD, scalar_convert_montgomery)(input, size, is_to_montgomery, &config, output); } #ifdef EXT_FIELD @@ -218,16 +373,16 @@ namespace icicle { ExtFieldConvertMontgomeryDispatcher, extension_scalar_convert_montgomery, extFieldConvertMontgomeryImpl) extern "C" eIcicleError CONCAT_EXPAND(FIELD, extension_scalar_convert_montgomery)( - const extension_t* input, uint64_t size, bool is_into, const VecOpsConfig* config, extension_t* output) + const extension_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig* config, extension_t* output) { - return ExtFieldConvertMontgomeryDispatcher::execute(input, size, is_into, *config, output); + return ExtFieldConvertMontgomeryDispatcher::execute(input, size, is_to_montgomery, *config, output); } template <> eIcicleError convert_montgomery( - const extension_t* input, uint64_t size, bool is_into, const VecOpsConfig& config, extension_t* output) + const extension_t* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_scalar_convert_montgomery)(input, size, is_into, &config, output); + return CONCAT_EXPAND(FIELD, extension_scalar_convert_montgomery)(input, size, is_to_montgomery, &config, output); } #endif // EXT_FIELD @@ -271,11 +426,12 @@ namespace icicle { const scalar_t* input, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig* config, scalar_t* output) { - return ScalarSliceDispatcher::execute(input, offset, stride, size, *config, output); + return ScalarSliceDispatcher::execute(input, offset, stride, size_in, size_out, *config, output); } template <> @@ -283,11 +439,31 @@ namespace icicle { const scalar_t* input, uint64_t offset, uint64_t stride, - uint64_t 
size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig& config, scalar_t* output) { - return CONCAT_EXPAND(FIELD, slice)(input, offset, stride, size, &config, output); + return CONCAT_EXPAND(FIELD, slice)(input, offset, stride, size_in, size_out, &config, output); + } + + // Deprecated API + template <> + eIcicleError slice( + const scalar_t* input, + uint64_t offset, + uint64_t stride, + uint64_t size_out, + const VecOpsConfig& config, + scalar_t* output) + { + const auto size_in = offset + stride * (size_out - 1) + 1; // input should be at least that large + ICICLE_LOG_WARNING << "This slice API is deprecated and has been replaced. Use the new slice API instead"; + if (config.batch_size != 1) { + ICICLE_LOG_ERROR << "deprecated slice API does not support batch"; + return eIcicleError::INVALID_ARGUMENT; + } + return slice(input, offset, stride, size_in, size_out, config, output); } #ifdef EXT_FIELD @@ -297,11 +473,12 @@ namespace icicle { const extension_t* input, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig* config, extension_t* output) { - return ExtFieldSliceDispatcher::execute(input, offset, stride, size, *config, output); + return ExtFieldSliceDispatcher::execute(input, offset, stride, size_in, size_out, *config, output); } template <> @@ -309,15 +486,16 @@ namespace icicle { const extension_t* input, uint64_t offset, uint64_t stride, - uint64_t size, + uint64_t size_in, + uint64_t size_out, const VecOpsConfig& config, extension_t* output) { - return CONCAT_EXPAND(FIELD, extension_slice)(input, offset, stride, size, &config, output); + return CONCAT_EXPAND(FIELD, extension_slice)(input, offset, stride, size_in, size_out, &config, output); } #endif // EXT_FIELD - /*********************************** HIGHEST NON ZERO IDX ***********************************/ + /*********************************** HIGHEST NON ZERO IDX ***********************************/ ICICLE_DISPATCHER_INST(ScalarHighestNonZeroIdxDispatcher, highest_non_zero_idx, scalarHighNonZeroIdxOpImpl) @@ -367,25 +545,25 @@ namespace icicle { extern "C" eIcicleError CONCAT_EXPAND(FIELD, poly_division)( const scalar_t* numerator, - int64_t numerator_deg, - const scalar_t* denumerator, - int64_t denumerator_deg, - const VecOpsConfig* config, + uint64_t numerator_size, + const scalar_t* denominator, + uint64_t denominator_size, + const VecOpsConfig& config, scalar_t* q_out /*OUT*/, uint64_t q_size, scalar_t* r_out /*OUT*/, uint64_t r_size) { return ScalarPolyDivDispatcher::execute( - numerator, numerator_deg, denumerator, denumerator_deg, *config, q_out, q_size, r_out, r_size); + numerator, numerator_size, denominator, denominator_size, config, q_out, q_size, r_out, r_size); } template <> eIcicleError polynomial_division( const scalar_t* numerator, - int64_t numerator_deg, - const scalar_t* denumerator, - int64_t denumerator_deg, + uint64_t numerator_size, + const scalar_t* denominator, + uint64_t denominator_size, const VecOpsConfig& config, scalar_t* q_out /*OUT*/, uint64_t q_size, @@ -393,7 +571,7 @@ namespace icicle { uint64_t r_size) { return CONCAT_EXPAND(FIELD, poly_division)( - numerator, numerator_deg, denumerator, denumerator_deg, &config, q_out, q_size, r_out, r_size); + numerator, numerator_size, denominator, denominator_size, config, q_out, q_size, r_out, r_size); } } // namespace icicle \ No newline at end of file diff --git a/icicle/tests/test_field_api.cpp b/icicle/tests/test_field_api.cpp index 072142876..703018797 100644 ---
a/icicle/tests/test_field_api.cpp +++ b/icicle/tests/test_field_api.cpp @@ -1,3 +1,4 @@ +#include #include #include #include "dlfcn.h" @@ -14,6 +15,8 @@ using namespace field_config; using namespace icicle; +// TODO Hadar - add tests that test different configurations of data on device or on host. + using FpMicroseconds = std::chrono::duration; #define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); #define END_TIMER(timer, msg, enable) \ @@ -22,12 +25,13 @@ using FpMicroseconds = std::chrono::duration s_registered_devices; +bool s_is_cuda_registered; // TODO Yuval remove this -template -class FieldApiTest : public ::testing::Test +class FieldApiTestBase : public ::testing::Test { public: // SetUpTestSuite/TearDownTestSuite are called once for the entire test suite @@ -38,10 +42,11 @@ class FieldApiTest : public ::testing::Test #endif icicle_load_backend_from_env_or_default(); - const bool is_cuda_registered = is_device_registered("CUDA"); - if (!is_cuda_registered) { ICICLE_LOG_ERROR << "CUDA device not found. Testing CPU vs CPU"; } - s_main_target = is_cuda_registered ? "CUDA" : "CPU"; + s_is_cuda_registered = is_device_registered("CUDA"); + if (!s_is_cuda_registered) { ICICLE_LOG_ERROR << "CUDA device not found. Testing CPU vs reference (on cpu)"; } + s_main_target = s_is_cuda_registered ? "CUDA" : "CPU"; s_reference_target = "CPU"; + s_registered_devices = get_registered_devices_list(); } static void TearDownTestSuite() { @@ -52,7 +57,12 @@ class FieldApiTest : public ::testing::Test // SetUp/TearDown are called before and after each test void SetUp() override {} void TearDown() override {} +}; +template +class FieldApiTest : public FieldApiTestBase +{ +public: void random_samples(T* arr, uint64_t count) { for (uint64_t i = 0; i < count; i++) @@ -84,16 +94,24 @@ TYPED_TEST(FieldApiTest, FieldSanityTest) ASSERT_EQ(a * scalar_t::from(2), a + a); } -TYPED_TEST(FieldApiTest, vectorOps) +TYPED_TEST(FieldApiTest, vectorVectorOps) { - const uint64_t N = 1 << 22; - auto in_a = std::make_unique(N); - auto in_b = std::make_unique(N); - FieldApiTest::random_samples(in_a.get(), N); - FieldApiTest::random_samples(in_b.get(), N); + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; - auto out_main = std::make_unique(N); - auto out_ref = std::make_unique(N); + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + + const int total_size = N * batch_size; + auto in_a = std::make_unique(total_size); + auto in_b = std::make_unique(total_size); + auto out_main = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); auto vector_accumulate_wrapper = [](TypeParam* a, const TypeParam* b, uint64_t size, const VecOpsConfig& config, TypeParam* /*out*/) { @@ -105,6 +123,8 @@ TYPED_TEST(FieldApiTest, vectorOps) Device dev = {dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; std::ostringstream oss; oss << dev_type << " " << msg; @@ -116,45 +136,329 @@ TYPED_TEST(FieldApiTest, vectorOps) END_TIMER(VECADD_sync, oss.str().c_str(), measure); }; - // warmup - // run(s_reference_target, out_ref.get(), false /*=measure*/, 16 /*=iters*/); - // run(s_main_target, out_main.get(), false /*=measure*/, 1 /*=iters*/); + // add + 
FieldApiTest::random_samples(in_a.get(), total_size); + FieldApiTest::random_samples(in_b.get(), total_size); + if (!s_is_cuda_registered) { + for (int i = 0; i < total_size; i++) { + out_ref[i] = in_a[i] + in_b[i]; + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); // accumulate - auto temp_result = std::make_unique(N); - auto initial_in_a = std::make_unique(N); - - std::memcpy(initial_in_a.get(), in_a.get(), N * sizeof(TypeParam)); - run(s_reference_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); - std::memcpy(temp_result.get(), in_a.get(), N * sizeof(TypeParam)); - std::memcpy(in_a.get(), initial_in_a.get(), N * sizeof(TypeParam)); + FieldApiTest::random_samples(in_a.get(), total_size); + FieldApiTest::random_samples(in_b.get(), total_size); + for (int i = 0; i < total_size; i++) { // TODO - compare gpu against cpu with inplace operations? + out_ref[i] = in_a[i] + in_b[i]; + } run(s_main_target, nullptr, VERBOSE /*=measure*/, vector_accumulate_wrapper, "vector accumulate", ITERS); - ASSERT_EQ(0, memcmp(in_a.get(), temp_result.get(), N * sizeof(TypeParam))); - // add - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); - run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_add, "vector add", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), N * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), total_size * sizeof(TypeParam))); // sub - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sub, "vector sub", ITERS); + FieldApiTest::random_samples(in_a.get(), total_size); + FieldApiTest::random_samples(in_b.get(), total_size); + if (!s_is_cuda_registered) { + for (int i = 0; i < total_size; i++) { + out_ref[i] = in_a[i] - in_b[i]; + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sub, "vector sub", ITERS); + } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sub, "vector sub", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), N * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); // mul - run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_mul, "vector mul", ITERS); + FieldApiTest::random_samples(in_a.get(), total_size); + FieldApiTest::random_samples(in_b.get(), total_size); + if (!s_is_cuda_registered) { + for (int i = 0; i < total_size; i++) { + out_ref[i] = in_a[i] * in_b[i]; + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_mul, "vector mul", ITERS); + } run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_mul, "vector mul", ITERS); - ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), N * sizeof(TypeParam))); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + + // div + TypeParam::rand_host_many(in_a.get(), total_size); + TypeParam::rand_host_many(in_b.get(), total_size); + // reference + if (!s_is_cuda_registered) { + for (int i = 0; i < total_size; i++) { + out_ref[i] = in_a[i] * TypeParam::inverse(in_b[i]); + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_div, "vector div", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, 
vector_div, "vector div", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); } -TYPED_TEST(FieldApiTest, matrixAPIsAsync) +TYPED_TEST(FieldApiTest, montgomeryConversion) { - const int R = 1 << 10, C = 1 << 8; - auto h_in = std::make_unique(R * C); - FieldApiTest::random_samples(h_in.get(), R * C); + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const bool is_to_montgomery = rand() % 2; + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + ICICLE_LOG_DEBUG << "is_to_montgomery = " << is_to_montgomery; + const int total_size = N * batch_size; + auto in_a = std::make_unique(total_size); + auto out_main = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); + + auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + + std::ostringstream oss; + oss << dev_type << " " << msg; + + START_TIMER(MONTGOMERY) + for (int i = 0; i < iters; ++i) { + ICICLE_CHECK(convert_montgomery(in_a.get(), N, is_to_montgomery, config, out)); + } + END_TIMER(MONTGOMERY, oss.str().c_str(), measure); + }; - auto h_out_main = std::make_unique(R * C); - auto h_out_ref = std::make_unique(R * C); + // convert_montgomery + FieldApiTest::random_samples(in_a.get(), total_size); + // reference + if (!s_is_cuda_registered) { + if (is_to_montgomery) { + for (int i = 0; i < total_size; i++) { + out_ref[i] = TypeParam::to_montgomery(in_a[i]); + } + } else { + for (int i = 0; i < total_size; i++) { + out_ref[i] = TypeParam::from_montgomery(in_a[i]); + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "montgomery", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "montgomery", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); +} + +TEST_F(FieldApiTestBase, VectorReduceOps) +{ + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const int total_size = N * batch_size; + + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + + auto in_a = std::make_unique(total_size); + auto out_main = std::make_unique(batch_size); + auto out_ref = std::make_unique(batch_size); + + auto vector_accumulate_wrapper = + [](scalar_t* a, const scalar_t* b, uint64_t size, const VecOpsConfig& config, scalar_t* /*out*/) { + return vector_accumulate(a, b, size, config); + }; + + auto run = + [&](const std::string& dev_type, scalar_t* out, bool measure, auto vec_op_func, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + + std::ostringstream oss; + oss << dev_type << " " << msg; + + START_TIMER(VECADD_sync) + for (int i = 0; i < iters; ++i) { + ICICLE_CHECK(vec_op_func(in_a.get(), N, config, out)); + } + END_TIMER(VECADD_sync, 
oss.str().c_str(), measure); + }; + + // sum + scalar_t::rand_host_many(in_a.get(), total_size); + // reference + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + out_ref[idx_in_batch] = scalar_t::from(0); + } + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { + uint64_t idx_a = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_in_batch] = out_ref[idx_in_batch] + in_a[idx_a]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_sum, "vector sum", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(scalar_t))); + + // product + scalar_t::rand_host_many(in_a.get(), total_size); + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + out_ref[idx_in_batch] = scalar_t::from(1); + } + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { + uint64_t idx_a = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_in_batch] = out_ref[idx_in_batch] * in_a[idx_a]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, vector_product, "vector product", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(scalar_t))); +} + +TEST_F(FieldApiTestBase, scalarVectorOps) +{ + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + + const int total_size = N * batch_size; + auto scalar_a = std::make_unique(batch_size); + auto in_b = std::make_unique(total_size); + auto out_main = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + + auto vector_accumulate_wrapper = + [](scalar_t* a, const scalar_t* b, uint64_t size, const VecOpsConfig& config, scalar_t* /*out*/) { + return vector_accumulate(a, b, size, config); + }; + + auto run = + [&](const std::string& dev_type, scalar_t* out, bool measure, auto vec_op_func, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + + std::ostringstream oss; + oss << dev_type << " " << msg; + + START_TIMER(VECADD_sync) + for (int i = 0; i < iters; ++i) { + ICICLE_CHECK(vec_op_func(scalar_a.get(), in_b.get(), N, config, out)); + } + END_TIMER(VECADD_sync, oss.str().c_str(), measure); + }; + + // scalar add vec + scalar_t::rand_host_many(scalar_a.get(), batch_size); + scalar_t::rand_host_many(in_b.get(), total_size); + + // reference + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; 
idx_in_N++) { + uint64_t idx_b = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_b] = (scalar_a[idx_in_batch]) + in_b[idx_b]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_add_vec, "scalar add vec", ITERS); + + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); + + // scalar sub vec + scalar_t::rand_host_many(scalar_a.get(), batch_size); + scalar_t::rand_host_many(in_b.get(), total_size); + + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { + uint64_t idx_b = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_b] = (scalar_a[idx_in_batch]) - in_b[idx_b]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); + } + + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_sub_vec, "scalar sub vec", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); + + // scalar mul vec + scalar_t::rand_host_many(scalar_a.get(), batch_size); + scalar_t::rand_host_many(in_b.get(), total_size); + + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t idx_in_N = 0; idx_in_N < N; idx_in_N++) { + uint64_t idx_b = columns_batch ? idx_in_N * batch_size + idx_in_batch : idx_in_batch * N + idx_in_N; + out_ref[idx_b] = (scalar_a[idx_in_batch]) * in_b[idx_b]; + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, scalar_mul_vec, "scalar mul vec", ITERS); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(scalar_t))); +} + +TYPED_TEST(FieldApiTest, matrixAPIsAsync) +{ + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const int R = + 1 + << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 + const int C = + 1 + << (rand() % 8 + 2); // cpu implementation for out of place transpose also supports sizes which are not powers of 2 + const int batch_size = 1 << (rand() % 4); + const bool columns_batch = rand() % 2; + const bool is_in_place = + s_is_cuda_registered ? 
0 : rand() % 2; // TODO - fix inplace (Hadar: I'm not sure we should support it) + + ICICLE_LOG_DEBUG << "rows = " << R; + ICICLE_LOG_DEBUG << "cols = " << C; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + + const int total_size = R * C * batch_size; + auto h_inout = std::make_unique(total_size); + auto h_out_main = std::make_unique(total_size); + auto h_out_ref = std::make_unique(total_size); auto run = [&](const std::string& dev_type, TypeParam* h_out, bool measure, const char* msg, int iters) { Device dev = {dev_type, 0}; @@ -163,6 +467,8 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) DeviceProperties device_props; icicle_get_device_properties(device_props); auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; std::ostringstream oss; oss << dev_type << " " << msg; @@ -172,16 +478,16 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) TypeParam *d_in, *d_out; if (!device_props.using_host_memory) { icicle_create_stream(&config.stream); - icicle_malloc_async((void**)&d_in, R * C * sizeof(TypeParam), config.stream); - icicle_malloc_async((void**)&d_out, R * C * sizeof(TypeParam), config.stream); - icicle_copy_to_device_async(d_in, h_in.get(), R * C * sizeof(TypeParam), config.stream); + icicle_malloc_async((void**)&d_in, total_size * sizeof(TypeParam), config.stream); + icicle_malloc_async((void**)&d_out, total_size * sizeof(TypeParam), config.stream); + icicle_copy_to_device_async(d_in, h_inout.get(), total_size * sizeof(TypeParam), config.stream); config.is_a_on_device = true; config.is_result_on_device = true; config.is_async = false; } - TypeParam* in = device_props.using_host_memory ? h_in.get() : d_in; + TypeParam* in = device_props.using_host_memory ? h_inout.get() : d_in; TypeParam* out = device_props.using_host_memory ? h_out : d_out; START_TIMER(TRANSPOSE) @@ -191,106 +497,367 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync) END_TIMER(TRANSPOSE, oss.str().c_str(), measure); if (!device_props.using_host_memory) { - icicle_copy_to_host_async(h_out, d_out, R * C * sizeof(TypeParam), config.stream); + icicle_copy_to_host_async(h_out, d_out, total_size * sizeof(TypeParam), config.stream); icicle_stream_synchronize(config.stream); icicle_free_async(d_in, config.stream); icicle_free_async(d_out, config.stream); } }; - run(s_reference_target, h_out_ref.get(), VERBOSE /*=measure*/, "transpose", ITERS); - run(s_main_target, h_out_main.get(), VERBOSE /*=measure*/, "transpose", ITERS); - ASSERT_EQ(0, memcmp(h_out_main.get(), h_out_ref.get(), R * C * sizeof(TypeParam))); + // Option 1: Initialize each input matrix in the batch with the same ascending values + // for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + // for (uint32_t i = 0; i < R * C; i++) { + // if(columns_batch){ + // h_inout[idx_in_batch + batch_size * i] = TypeParam::from(i); + // } else { + // h_inout[idx_in_batch * R * C + i] = TypeParam::from(i); + // } + // } + // } + + // Option 2: Initialize the entire input array with ascending values + // for (int i = 0; i < total_size; i++) { + // h_inout[i] = TypeParam::from(i); + // } + + // Option 3: Initialize the entire input array with random values + TypeParam::rand_host_many(h_inout.get(), total_size); + + // Reference implementation + if (!s_is_cuda_registered) { + const TypeParam* cur_mat_in = h_inout.get(); + TypeParam* cur_mat_out = h_out_ref.get(); + uint32_t stride = columns_batch ? 
batch_size : 1; + const uint64_t total_elements_one_mat = static_cast(R) * C; + for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + // Perform the matrix transpose + for (uint32_t i = 0; i < R; ++i) { + for (uint32_t j = 0; j < C; ++j) { + cur_mat_out[stride * (j * R + i)] = cur_mat_in[stride * (i * C + j)]; + } + } + cur_mat_in += (columns_batch ? 1 : total_elements_one_mat); + cur_mat_out += (columns_batch ? 1 : total_elements_one_mat); + } + } else { + run(s_reference_target, (is_in_place ? h_inout.get() : h_out_ref.get()), VERBOSE /*=measure*/, "transpose", ITERS); + } + + run(s_main_target, (is_in_place ? h_inout.get() : h_out_main.get()), VERBOSE /*=measure*/, "transpose", ITERS); + + if (is_in_place) { + ASSERT_EQ(0, memcmp(h_inout.get(), h_out_ref.get(), total_size * sizeof(TypeParam))); + } else { + ASSERT_EQ(0, memcmp(h_out_main.get(), h_out_ref.get(), total_size * sizeof(TypeParam))); + } } -TYPED_TEST(FieldApiTest, montgomeryConversion) +TYPED_TEST(FieldApiTest, bitReverse) { - const uint64_t N = 1 << 18; - auto elements_main = std::make_unique(N); - auto elements_ref = std::make_unique(N); - FieldApiTest::random_samples(elements_main.get(), N); - memcpy(elements_ref.get(), elements_main.get(), N * sizeof(TypeParam)); + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const bool is_in_place = rand() % 2; + const int total_size = N * batch_size; - auto run = [&](const std::string& dev_type, TypeParam* inout, bool measure, const char* msg, int iters) { + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + ICICLE_LOG_DEBUG << "is_in_place = " << is_in_place; + + auto in_a = std::make_unique(total_size); + auto out_main = std::make_unique(total_size); + auto out_ref = std::make_unique(total_size); + + auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { Device dev = {dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; std::ostringstream oss; oss << dev_type << " " << msg; - START_TIMER(MONTGOMERY) + START_TIMER(BIT_REVERSE) for (int i = 0; i < iters; ++i) { - ICICLE_CHECK(convert_montgomery(inout, N, true /*into montgomery*/, config, inout)); + ICICLE_CHECK(bit_reverse(in_a.get(), N, config, out)); } - END_TIMER(MONTGOMERY, oss.str().c_str(), measure); + END_TIMER(BIT_REVERSE, oss.str().c_str(), measure); }; - run(s_reference_target, elements_main.get(), VERBOSE /*=measure*/, "montgomery", 1); - run(s_main_target, elements_ref.get(), VERBOSE /*=measure*/, "montgomery", 1); - ASSERT_EQ(0, memcmp(elements_main.get(), elements_ref.get(), N * sizeof(TypeParam))); + // // Option 1: Initialize each input vector in the batch with the same ascending values + // for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + // for (uint32_t i = 0; i < N; i++) { + // if(columns_batch){ + // in_a[idx_in_batch + batch_size * i] = TypeParam::from(i); + // } else { + // in_a[idx_in_batch * N + i] = TypeParam::from(i); + // } + // } + // } + + // // Option 2: Initialize the entire input array with ascending values + // for (int i = 0; i < total_size; i++) { + // in_a[i] = TypeParam::from(i); + // } + + // Option 3: Initialize the entire input array with 
random values + FieldApiTest::random_samples(in_a.get(), total_size); + + // Reference implementation + if (!s_is_cuda_registered || is_in_place) { + uint64_t logn = 0; + uint64_t temp = N; + while (temp > 1) { + temp >>= 1; + logn++; + } + // BIT REVERSE FUNCTION + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t i = 0; i < N; i++) { + int rev = 0; + for (int j = 0; j < logn; ++j) { + if (i & (1 << j)) { rev |= 1 << (logn - 1 - j); } + } + if (columns_batch) { + out_ref[idx_in_batch + batch_size * i] = in_a[idx_in_batch + batch_size * rev]; + } else { + out_ref[idx_in_batch * N + i] = in_a[idx_in_batch * N + rev]; + } + } + } + } else { + run(s_reference_target, (is_in_place ? in_a.get() : out_ref.get()), VERBOSE /*=measure*/, "bit-reverse", 1); + } + run(s_main_target, (is_in_place ? in_a.get() : out_main.get()), VERBOSE /*=measure*/, "bit-reverse", 1); + + if (is_in_place) { + ASSERT_EQ(0, memcmp(in_a.get(), out_ref.get(), N * sizeof(TypeParam))); + } else { + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size * sizeof(TypeParam))); + } } -TYPED_TEST(FieldApiTest, bitReverse) +TYPED_TEST(FieldApiTest, Slice) +{ + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t size_in = 1 << (rand() % 15 + 5); + const uint64_t offset = rand() % 15; + const uint64_t stride = rand() % 4 + 1; + const uint64_t size_out = rand() % (((size_in - offset) / stride) - 1) + 1; + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + + ICICLE_LOG_DEBUG << "size_in = " << size_in; + ICICLE_LOG_DEBUG << "size_out = " << size_out; + ICICLE_LOG_DEBUG << "offset = " << offset; + ICICLE_LOG_DEBUG << "stride = " << stride; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + + const int total_size_in = size_in * batch_size; + const int total_size_out = size_out * batch_size; + + auto in_a = std::make_unique(total_size_in); + auto out_main = std::make_unique(total_size_out); + auto out_ref = std::make_unique(total_size_out); + + TypeParam::rand_host_many(in_a.get(), total_size_in); + + auto run = [&](const std::string& dev_type, TypeParam* out, bool measure, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; + + std::ostringstream oss; + oss << dev_type << " " << msg; + + START_TIMER(SLICE) + for (int i = 0; i < iters; ++i) { + ICICLE_CHECK(slice(in_a.get(), offset, stride, size_in, size_out, config, out)); + } + END_TIMER(SLICE, oss.str().c_str(), measure); + }; + + // Reference implementation + if (!s_is_cuda_registered) { + for (uint64_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) { + for (uint64_t i = 0; i < size_out; i++) { + if (columns_batch) { + out_ref[idx_in_batch + batch_size * i] = in_a[idx_in_batch + batch_size * (offset + i * stride)]; + } else { + out_ref[idx_in_batch * size_out + i] = in_a[idx_in_batch * size_in + (offset + i * stride)]; + } + } + } + } else { + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "slice", 1); + } + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "slice", 1); + + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_size_out * sizeof(TypeParam))); +} + +TEST_F(FieldApiTestBase, highestNonZeroIdx) { - const uint64_t N = 1 << 18; - auto elements_main = std::make_unique(N); - auto elements_ref = 
std::make_unique(N); - FieldApiTest::random_samples(elements_main.get(), N); - memcpy(elements_ref.get(), elements_main.get(), N * sizeof(TypeParam)); + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t N = 1 << (rand() % 15 + 3); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + const int total_size = N * batch_size; + + auto in_a = std::make_unique(total_size); + for (int i = 0; i < batch_size; ++i) { + // randomize different rows with zeros in the end + auto size = std::max(int64_t(N) / 4 - i, int64_t(1)); + scalar_t::rand_host_many(in_a.get() + i * N, size); + } + auto out_main = std::make_unique(batch_size); + auto out_ref = std::make_unique(batch_size); - auto run = [&](const std::string& dev_type, TypeParam* inout, bool measure, const char* msg, int iters) { + auto run = [&](const std::string& dev_type, int64_t* out, bool measure, const char* msg, int iters) { Device dev = {dev_type, 0}; icicle_set_device(dev); auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; std::ostringstream oss; oss << dev_type << " " << msg; - START_TIMER(BIT_REVERSE) + START_TIMER(highestNonZeroIdx) for (int i = 0; i < iters; ++i) { - ICICLE_CHECK(bit_reverse(inout, N, config, inout)); + ICICLE_CHECK(highest_non_zero_idx(in_a.get(), N, config, out)); } - END_TIMER(BIT_REVERSE, oss.str().c_str(), measure); + END_TIMER(highestNonZeroIdx, oss.str().c_str(), measure); }; - run(s_reference_target, elements_main.get(), VERBOSE /*=measure*/, "bit-reverse", 1); - run(s_main_target, elements_ref.get(), VERBOSE /*=measure*/, "bit-reverse", 1); - ASSERT_EQ(0, memcmp(elements_main.get(), elements_ref.get(), N * sizeof(TypeParam))); + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "highest_non_zero_idx", 1); + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "highest_non_zero_idx", 1); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), batch_size * sizeof(int64_t))); } -TYPED_TEST(FieldApiTest, Slice) +TEST_F(FieldApiTestBase, polynomialEval) { - const uint64_t N = 1 << 18; - const uint64_t offset = 2; - const uint64_t stride = 3; - const uint64_t size = 4; + int seed = time(0); + srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; + const uint64_t coeffs_size = 1 << (rand() % 10 + 4); + const uint64_t domain_size = 1 << (rand() % 8 + 2); + const int batch_size = 1 << (rand() % 5); + const bool columns_batch = rand() % 2; + + ICICLE_LOG_DEBUG << "coeffs_size = " << coeffs_size; + ICICLE_LOG_DEBUG << "domain_size = " << domain_size; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + + const int total_coeffs_size = coeffs_size * batch_size; + const int total_result_size = domain_size * batch_size; + + auto in_coeffs = std::make_unique(total_coeffs_size); + auto in_domain = std::make_unique(domain_size); + auto out_main = std::make_unique(total_result_size); + auto out_ref = std::make_unique(total_result_size); + + auto run = [&](const std::string& dev_type, scalar_t* out, bool measure, const char* msg, int iters) { + Device dev = {dev_type, 0}; + icicle_set_device(dev); + auto config = default_vec_ops_config(); + config.batch_size = batch_size; + config.columns_batch = columns_batch; - auto elements_main = std::make_unique(N); - auto elements_ref = std::make_unique(size); - auto elements_out = std::make_unique(size); + std::ostringstream oss; + oss << dev_type << " " << msg; - 
FieldApiTest::random_samples(elements_main.get(), N); + START_TIMER(polynomialEval) + for (int i = 0; i < iters; ++i) { + ICICLE_CHECK(polynomial_eval(in_coeffs.get(), coeffs_size, in_domain.get(), domain_size, config, out)); + } + END_TIMER(polynomialEval, oss.str().c_str(), measure); + }; - auto run = - [&](const std::string& dev_type, const TypeParam* in, TypeParam* out, bool measure, const char* msg, int iters) { - Device dev = {dev_type, 0}; - icicle_set_device(dev); - auto config = VecOpsConfig(); // Adjust configuration as needed + scalar_t::rand_host_many(in_coeffs.get(), total_coeffs_size); + scalar_t::rand_host_many(in_domain.get(), domain_size); - std::ostringstream oss; - oss << dev_type << " " << msg; + run(s_main_target, out_main.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); + run(s_reference_target, out_ref.get(), VERBOSE /*=measure*/, "polynomial_eval", 1); + ASSERT_EQ(0, memcmp(out_main.get(), out_ref.get(), total_result_size * sizeof(scalar_t))); +} - START_TIMER(SLICE) - for (int i = 0; i < iters; ++i) { - ICICLE_CHECK(slice(in, offset, stride, size, config, out)); +TEST_F(FieldApiTestBase, polynomialDivision) +{ + const uint64_t numerator_size = 1 << 4; + const uint64_t denominator_size = 1 << 3; + const uint64_t q_size = numerator_size - denominator_size + 1; + const uint64_t r_size = numerator_size; + const int batch_size = 10 + rand() % 10; + + // basically we compute q(x),r(x) for a(x)=q(x)b(x)+r(x) by dividing a(x)/b(x) + + // randomize matrix with rows/cols as polynomials + auto numerator = std::make_unique(numerator_size * batch_size); + auto denominator = std::make_unique(denominator_size * batch_size); + scalar_t::rand_host_many(numerator.get(), numerator_size * batch_size); + scalar_t::rand_host_many(denominator.get(), denominator_size * batch_size); + + // Add padding to each row so that the degree is lower than the size + const int zero_pad_length = 5; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < zero_pad_length; ++j) { + numerator[i * numerator_size + numerator_size - zero_pad_length + j] = scalar_t::zero(); + denominator[i * denominator_size + denominator_size - zero_pad_length + j] = scalar_t::zero(); + } + } + + for (auto device : s_registered_devices) { + ICICLE_CHECK(icicle_set_device(device)); + for (int columns_batch = 0; columns_batch <= 1; columns_batch++) { + ICICLE_LOG_DEBUG << "testing polynomial division on device " << device << " [column_batch=" << columns_batch + << "]"; + auto q = std::make_unique(q_size * batch_size); + auto r = std::make_unique(r_size * batch_size); + + auto config = default_vec_ops_config(); + config.batch_size = columns_batch ? 
batch_size - zero_pad_length : batch_size; // skip the zero cols + config.columns_batch = columns_batch; + // TODO v3.2 support column batch for this API + if (columns_batch) { + ICICLE_LOG_INFO << "Skipping polynomial division column batch"; + continue; } - END_TIMER(SLICE, oss.str().c_str(), measure); - }; - run(s_reference_target, elements_main.get(), elements_ref.get(), VERBOSE /*=measure*/, "slice", 1); - run(s_main_target, elements_main.get(), elements_out.get(), VERBOSE /*=measure*/, "slice", 1); - ASSERT_EQ(0, memcmp(elements_ref.get(), elements_out.get(), size * sizeof(TypeParam))); + ICICLE_CHECK(polynomial_division( + numerator.get(), numerator_size, denominator.get(), denominator_size, config, q.get(), q_size, r.get(), + r_size)); + + // test a(x)=q(x)b(x)+r(x) in random point + const auto rand_x = scalar_t::rand_host(); + auto ax = std::make_unique(config.batch_size); + auto bx = std::make_unique(config.batch_size); + auto qx = std::make_unique(config.batch_size); + auto rx = std::make_unique(config.batch_size); + polynomial_eval(numerator.get(), numerator_size, &rand_x, 1, config, ax.get()); + polynomial_eval(denominator.get(), denominator_size, &rand_x, 1, config, bx.get()); + polynomial_eval(q.get(), q_size, &rand_x, 1, config, qx.get()); + polynomial_eval(r.get(), r_size, &rand_x, 1, config, rx.get()); + + for (int i = 0; i < config.batch_size; ++i) { + // ICICLE_LOG_DEBUG << "ax=" << ax[i] << ", bx=" << bx[i] << ", qx=" << qx[i] << ", rx=" << rx[i]; + ASSERT_EQ(ax[i], qx[i] * bx[i] + rx[i]); + } + } + } } #ifdef NTT @@ -301,13 +868,15 @@ TYPED_TEST(FieldApiTest, ntt) int seed = time(0); srand(seed); + ICICLE_LOG_DEBUG << "seed = " << seed; const bool inplace = rand() % 2; const int logn = rand() % 15 + 3; const uint64_t N = 1 << logn; const int log_ntt_domain_size = logn + 1; const int log_batch_size = rand() % 3; const int batch_size = 1 << log_batch_size; - const Ordering ordering = static_cast(rand() % 4); + const int _ordering = rand() % 4; + const Ordering ordering = static_cast(_ordering); bool columns_batch; if (logn == 7 || logn < 4) { columns_batch = false; // currently not supported (icicle_v3/backend/cuda/src/ntt/ntt.cuh line 578) @@ -323,9 +892,17 @@ TYPED_TEST(FieldApiTest, ntt) coset_gen = scalar_t::one(); } + ICICLE_LOG_DEBUG << "N = " << N; + ICICLE_LOG_DEBUG << "batch_size = " << batch_size; + ICICLE_LOG_DEBUG << "columns_batch = " << columns_batch; + ICICLE_LOG_DEBUG << "inplace = " << inplace; + ICICLE_LOG_DEBUG << "ordering = " << _ordering; + ICICLE_LOG_DEBUG << "log_coset_stride = " << log_coset_stride; + const int total_size = N * batch_size; auto scalars = std::make_unique(total_size); - FieldApiTest::random_samples(scalars.get(), total_size); + TypeParam::rand_host_many(scalars.get(), total_size); + auto out_main = std::make_unique(total_size); auto out_ref = std::make_unique(total_size); auto run = [&](const std::string& dev_type, TypeParam* out, const char* msg, bool measure, int iters) { diff --git a/scripts/release/build_all.sh b/scripts/release/build_all.sh index cbb4b8860..b8050fb70 100755 --- a/scripts/release/build_all.sh +++ b/scripts/release/build_all.sh @@ -32,25 +32,25 @@ docker run --rm --gpus all \ -v ./icicle:/icicle \ -v "$output_dir:/output" \ -v ./scripts:/scripts \ - icicle-release-ubuntu22-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubuntu22 cuda122 & + icicle-release-ubuntu22-cuda122 bash /scripts/release/build_release_and_tar.sh icicle_3_1_0 ubuntu22 cuda122 & # ubuntu 20 docker run --rm --gpus all \ -v 
./icicle:/icicle \ -v "$output_dir:/output" \ -v ./scripts:/scripts \ - icicle-release-ubuntu20-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubuntu20 cuda122 & + icicle-release-ubuntu20-cuda122 bash /scripts/release/build_release_and_tar.sh icicle_3_1_0 ubuntu20 cuda122 & # ubi 8 (rhel compatible) docker run --rm --gpus all \ -v ./icicle:/icicle \ -v "$output_dir:/output" \ -v ./scripts:/scripts \ - icicle-release-ubi8-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubi8 cuda122 & + icicle-release-ubi8-cuda122 bash /scripts/release/build_release_and_tar.sh icicle_3_1_0 ubi8 cuda122 & # ubi 9 (rhel compatible) docker run --rm --gpus all \ -v ./icicle:/icicle \ -v "$output_dir:/output" \ -v ./scripts:/scripts \ - icicle-release-ubi9-cuda122 bash /scripts/release/build_release_and_tar.sh icicle30 ubi9 cuda122 & + icicle-release-ubi9-cuda122 bash /scripts/release/build_release_and_tar.sh icicle_3_1_0 ubi9 cuda122 & diff --git a/wrappers/golang/core/vec_ops.go b/wrappers/golang/core/vec_ops.go index 08b87ef08..3671f0653 100644 --- a/wrappers/golang/core/vec_ops.go +++ b/wrappers/golang/core/vec_ops.go @@ -29,7 +29,15 @@ type VecOpsConfig struct { /// non-blocking and you'll need to synchronize it explicitly by calling /// `SynchronizeStream`. If set to false, the function will block the current CPU thread. IsAsync bool - Ext config_extension.ConfigExtensionHandler + /// Number of vectors (or operations) to process in a batch. + /// Each vector operation will be performed independently on each batch element. + /// Default value: 1. + BatchSize int32 + /// True if the batched vectors are stored as columns in a 2D array (i.e., the vectors are + /// strided in memory as columns of a matrix). If false, the batched vectors are stored + /// contiguously in memory (e.g., as rows or in a flat array). Default value: false. 
+ ColumnsBatch bool + Ext config_extension.ConfigExtensionHandler } /** @@ -43,6 +51,8 @@ func DefaultVecOpsConfig() VecOpsConfig { false, // isBOnDevice false, // isResultOnDevice false, // IsAsync + 1, // BatchSize + false, // ColumnsBatch nil, // Ext } diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index ba22b776d..58e571d52 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -13,6 +13,8 @@ pub struct VecOpsConfig { pub is_b_on_device: bool, pub is_result_on_device: bool, pub is_async: bool, + pub batch_size: i32, + pub columns_batch: bool, pub ext: ConfigExtension, } @@ -24,6 +26,8 @@ impl VecOpsConfig { is_b_on_device: false, is_result_on_device: false, is_async: false, + batch_size: 1, + columns_batch: false, ext: ConfigExtension::new(), } } @@ -58,6 +62,46 @@ pub trait VecOps { cfg: &VecOpsConfig, ) -> Result<(), eIcicleError>; + fn div( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn sum( + a: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn product( + a: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn scalar_add( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn scalar_sub( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + + fn scalar_mul( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError>; + fn transpose( input: &(impl HostOrDeviceSlice + ?Sized), nof_rows: u32, @@ -76,6 +120,16 @@ pub trait VecOps { input: &mut (impl HostOrDeviceSlice + ?Sized), cfg: &VecOpsConfig, ) -> Result<(), eIcicleError>; + + fn slice( + input: &(impl HostOrDeviceSlice + ?Sized), + offset: u64, + stride: u64, + size_in: u64, + size_out: u64, + cfg: &VecOpsConfig, + output: &mut (impl HostOrDeviceSlice + ?Sized), + ) -> Result<(), eIcicleError>; } fn check_vec_ops_args<'a, F>( @@ -166,6 +220,88 @@ where <::Config as VecOps>::mul(a, b, result, &cfg) } +pub fn div_scalars( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(a, b, result, cfg); + <::Config as VecOps>::div(a, b, result, &cfg) +} + +pub fn sum_scalars( + a: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(a, a, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::sum(a, result, &cfg) +} + +pub fn product_scalars( + a: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let 
cfg = check_vec_ops_args(a, a, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::product(a, result, &cfg) +} + +pub fn scalar_add( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(b, b, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::scalar_add(a, b, result, &cfg) +} + +pub fn scalar_sub( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(b, b, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::scalar_sub(a, b, result, &cfg) +} + +pub fn scalar_mul( + a: &(impl HostOrDeviceSlice + ?Sized), + b: &(impl HostOrDeviceSlice + ?Sized), + result: &mut (impl HostOrDeviceSlice + ?Sized), + cfg: &VecOpsConfig, +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + let cfg = check_vec_ops_args(b, b, result, cfg); //TODO: emirsoyturk + <::Config as VecOps>::scalar_mul(a, b, result, &cfg) +} + pub fn transpose_matrix( input: &(impl HostOrDeviceSlice + ?Sized), nof_rows: u32, @@ -205,6 +341,22 @@ where <::Config as VecOps>::bit_reverse_inplace(input, &cfg) } +pub fn slice( + input: &(impl HostOrDeviceSlice + ?Sized), + offset: u64, + stride: u64, + size_in: u64, + size_out: u64, + cfg: &VecOpsConfig, + output: &mut (impl HostOrDeviceSlice + ?Sized), +) -> Result<(), eIcicleError> +where + F: FieldImpl, + ::Config: VecOps, +{ + <::Config as VecOps>::slice(input, offset, stride, size_in, size_out, &cfg, output) +} + #[macro_export] macro_rules! impl_vec_ops_field { ( @@ -255,6 +407,58 @@ macro_rules! impl_vec_ops_field { result: *mut $field, ) -> eIcicleError; + #[link_name = concat!($field_prefix, "_vector_div")] + pub(crate) fn vector_div_ffi( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_vector_sum")] + pub(crate) fn vector_sum_ffi( + a: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_vector_product")] + pub(crate) fn vector_product_ffi( + a: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_scalar_add_vec")] + pub(crate) fn scalar_add_ffi( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_scalar_sub_vec")] + pub(crate) fn scalar_sub_ffi( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_scalar_mul_vec")] + pub(crate) fn scalar_mul_ffi( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> eIcicleError; + #[link_name = concat!($field_prefix, "_matrix_transpose")] pub(crate) fn matrix_transpose_ffi( input: *const $field, @@ -271,6 +475,17 @@ macro_rules! 
impl_vec_ops_field { config: *const VecOpsConfig, output: *mut $field, ) -> eIcicleError; + + #[link_name = concat!($field_prefix, "_slice")] + pub(crate) fn slice_ffi( + input: *const $field, + offset: u64, + stride: u64, + size_in: u64, + size_out: u64, + cfg: *const VecOpsConfig, + output: *mut $field, + ) -> eIcicleError; } } @@ -345,6 +560,110 @@ macro_rules! impl_vec_ops_field { } } + fn div( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + b: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::vector_div_ffi( + a.as_ptr(), + b.as_ptr(), + a.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn sum( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::vector_sum_ffi( + a.as_ptr(), + a.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn product( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::vector_product_ffi( + a.as_ptr(), + a.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn scalar_add( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + b: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::scalar_add_ffi( + a.as_ptr(), + b.as_ptr(), + b.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn scalar_sub( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + b: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::scalar_sub_ffi( + a.as_ptr(), + b.as_ptr(), + b.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + + fn scalar_mul( + a: &(impl HostOrDeviceSlice<$field> + ?Sized), + b: &(impl HostOrDeviceSlice<$field> + ?Sized), + result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + cfg: &VecOpsConfig, + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::scalar_mul_ffi( + a.as_ptr(), + b.as_ptr(), + b.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } + } + fn transpose( input: &(impl HostOrDeviceSlice<$field> + ?Sized), nof_rows: u32, @@ -394,6 +713,29 @@ macro_rules! impl_vec_ops_field { .wrap() } } + + fn slice( + input: &(impl HostOrDeviceSlice<$field> + ?Sized), + offset: u64, + stride: u64, + size_in: u64, + size_out: u64, + cfg: &VecOpsConfig, + output: &mut (impl HostOrDeviceSlice<$field> + ?Sized), + ) -> Result<(), eIcicleError> { + unsafe { + $field_prefix_ident::slice_ffi( + input.as_ptr(), + offset, + stride, + size_in, + size_out, + cfg as *const VecOpsConfig, + output.as_mut_ptr(), + ) + .wrap() + } + } } }; } @@ -436,6 +778,12 @@ macro_rules!
impl_vec_ops_tests { initialize(); check_bit_reverse_inplace::<$field>() } + + #[test] + pub fn test_slice() { + initialize(); + check_slice::<$field>() + } } }; } diff --git a/wrappers/rust/icicle-core/src/vec_ops/tests.rs b/wrappers/rust/icicle-core/src/vec_ops/tests.rs index 6762f06c9..0dbd4c9a3 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/tests.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/tests.rs @@ -2,8 +2,9 @@ use crate::test_utilities; use crate::traits::GenerateRandom; use crate::vec_ops::{ - accumulate_scalars, add_scalars, bit_reverse, bit_reverse_inplace, mul_scalars, sub_scalars, transpose_matrix, - FieldImpl, VecOps, VecOpsConfig, + accumulate_scalars, add_scalars, bit_reverse, bit_reverse_inplace, div_scalars, mul_scalars, product_scalars, + scalar_add, scalar_mul, scalar_sub, slice, sub_scalars, sum_scalars, transpose_matrix, FieldImpl, VecOps, + VecOpsConfig, }; use icicle_runtime::device::Device; use icicle_runtime::memory::{DeviceVec, HostSlice}; @@ -44,6 +45,12 @@ where check_vec_ops_scalars_add::(test_size); check_vec_ops_scalars_sub::(test_size); check_vec_ops_scalars_mul::(test_size); + check_vec_ops_scalars_div::(test_size); + check_vec_ops_scalars_sum::(test_size); + check_vec_ops_scalars_product::(test_size); + check_vec_ops_scalars_add_scalar::(test_size); + check_vec_ops_scalars_sub_scalar::(test_size); + check_vec_ops_scalars_mul_scalar::(test_size); check_vec_ops_scalars_accumulate::(test_size); } @@ -140,6 +147,191 @@ where .unwrap(); } +pub fn check_vec_ops_scalars_div(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(test_size); + let b = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let b = HostSlice::from_slice(&b); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + + test_utilities::test_set_main_device(); + div_scalars(a_main, b, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + div_scalars(a_main, b, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_sum(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + + test_utilities::test_set_main_device(); + sum_scalars(a_main, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + sum_scalars(a_main, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_product(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); 
test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + + test_utilities::test_set_main_device(); + product_scalars(a_main, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + product_scalars(a_main, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_add_scalar(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(1); + let b = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let b = HostSlice::from_slice(&b); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + cfg.batch_size = 1; + + test_utilities::test_set_main_device(); + scalar_add(a_main, b, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + scalar_add(a_main, b, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_sub_scalar(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(1); + let b = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let b = HostSlice::from_slice(&b); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + cfg.batch_size = 1; + + test_utilities::test_set_main_device(); + scalar_sub(a_main, b, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + scalar_sub(a_main, b, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + +pub fn check_vec_ops_scalars_mul_scalar(test_size: usize) +where + ::Config: VecOps + GenerateRandom, +{ + let a_main = F::Config::generate_random(1); + let b = F::Config::generate_random(test_size); + let mut result_main = vec![F::zero(); test_size]; + let mut result_ref = vec![F::zero(); test_size]; + + let a_main = HostSlice::from_slice(&a_main); + let b = HostSlice::from_slice(&b); + let result_main = HostSlice::from_mut_slice(&mut result_main); + let result_ref = HostSlice::from_mut_slice(&mut result_ref); + + let mut stream = IcicleStream::create().unwrap(); + let mut cfg = VecOpsConfig::default(); + cfg.stream_handle = *stream; + cfg.batch_size = 1; + + test_utilities::test_set_main_device(); + scalar_mul(a_main, b, result_main, &cfg).unwrap(); + + test_utilities::test_set_ref_device(); + scalar_mul(a_main, b, result_ref, &cfg).unwrap(); + + assert_eq!(result_main.as_slice(), result_ref.as_slice()); + + stream + .destroy() + .unwrap(); +} + pub fn 
check_vec_ops_scalars_accumulate(test_size: usize) where ::Config: VecOps + GenerateRandom, @@ -205,6 +397,47 @@ where assert_eq!(result_main, result_ref); } +pub fn check_slice() +where + ::Config: VecOps + GenerateRandom, +{ + let size_in: u64 = 1 << 10; + let offset: u64 = 10; + let stride: u64 = 3; + let size_out: u64 = ((size_in - offset) / stride) - 1; + + let input_matrix = F::Config::generate_random(size_in as usize); + let mut result_main = vec![F::zero(); size_out as usize]; + let mut result_ref = vec![F::zero(); size_out as usize]; + + let cfg = VecOpsConfig::default(); + test_utilities::test_set_main_device(); + slice( + HostSlice::from_slice(&input_matrix), + offset, + stride, + size_in, + size_out, + &cfg, + HostSlice::from_mut_slice(&mut result_main), + ) + .unwrap(); + + test_utilities::test_set_ref_device(); + slice( + HostSlice::from_slice(&input_matrix), + offset, + stride, + size_in, + size_out, + &cfg, + HostSlice::from_mut_slice(&mut result_ref), + ) + .unwrap(); + + assert_eq!(result_main, result_ref); +} + pub fn check_bit_reverse() where ::Config: VecOps + GenerateRandom,