diff --git a/docs/build_api.py b/docs/build_api.py index a4c63dd..8d2c626 100644 --- a/docs/build_api.py +++ b/docs/build_api.py @@ -154,6 +154,7 @@ def build_index_page(groups): "all", "any", "count", + "dot", ], "Mathematical": [ ("abs", "abs(const V&)"), diff --git a/docs/guides.rst b/docs/guides.rst index 72c2ead..e1a39fd 100644 --- a/docs/guides.rst +++ b/docs/guides.rst @@ -5,5 +5,7 @@ Guides guides/introduction.rst guides/promotion.rst - guides/prelude.rst + guides/memory.rst + guides/accuracy.rst guides/constant.rst + guides/example.md diff --git a/docs/guides/accuracy.md b/docs/guides/accuracy.md new file mode 100644 index 0000000..da7bb53 --- /dev/null +++ b/docs/guides/accuracy.md @@ -0,0 +1,112 @@ +# Accuracy Level + +For certain operations, there might be alternative versions available that provide better performance at the cost of lower accuracy. +In other words, they are faster but also have a small error. + +## Fast Math + +For several operations in single precision (float32), there are "fast math" versions. These functions are faster since they are hardware accelerated, but are not IEEE compliant for all inputs. + +To use this functionality, use the `fast_*` functions from Kernel Float. + +```cpp +kf::vec x = {1.0f, 2.0f, 3.0f, 4.0f}; + +// Sine +kf::vec a = kf::fast_sin(x); + +// Square root +kf::vec b = kf::fast_sqrt(x); + +// Reciprocal `1/x` +kf::vec c = kf::fast_rcp(x); + +// Division `a/b` +kf::vec d = kf::fast_div(a, b); +``` + +These functions are only functional for 32-bit and 16-bit floats. +For other input types, the operation falls back to the regular version. + +## Approximate Math + +For 16-bit floats, several approximate functions are provided. +These use approximations (typically low-degree polynomials) to calculate rough estimates of the functions. +This can be very fast but also less accurate. + + +To use this functionality, use the `approx_*` functions from Kernel Float. 
For other input types, the operation falls back to the `fast_*` variant. + +```cpp +kf::vec x = {1.0, 2.0, 3.0, 4.0}; + +// Sine +kf::vec a = kf::approx_sin(x); + +// Square root +kf::vec b = kf::approx_sqrt(x); + +// Reciprocal `1/x` +kf::vec c = kf::approx_rcp(x); + +// Division `a/b` +kf::vec d = kf::approx_div(a, b); +``` + +You can adjust the degree of approximation by supplying an integer template parameter: + + +```cpp +// Sine approximation with polynomial of degree 1 +kf::vec a = kf::approx_sin<1>(x); + +// Polynomial of degree 2 +kf::vec a = kf::approx_sin<2>(x); + +// Polynomial of degree 3 +kf::vec a = kf::approx_sin<3>(x); +``` + +## Tuning Accuracy Level + +Many functions in Kernel Float accept an additional Accuracy option as a template parameter. +This allows you to tune the accuracy level without changing the function name. + +There are four possible values for this parameter: + +- `kf::accurate_policy`: Use the most accurate version of the function available. +- `kf::fast_policy`: Use the "fast math" version. +- `kf::approx_policy`: Use the approximate version with degree `N`. +- `kf::default_policy`: Use a global default policy (see the next section). + +For example, consider this code: + +```cpp +kf::vec input = {1.0f, 2.0f}; + +// Use the default policy +kf::vec a = kf::cos(input); + +// Use the default policy +kf::vec b = kf::cos(input); + +// Use the most accurate policy +kf::vec c = kf::cos(input); + +// Use the fastest policy +kf::vec d = kf::cos(input); + +// Use the approximate policy +kf::vec e = kf::cos>(input); + +// You can use aliases to define your own policy +using my_own_policy = kf::fast_policy; +kf::vec f = kf::cos(input); +``` + +## Setting `default_policy` + +By default, `kf::default_policy` is set to `kf::accurate_policy`. + +Set the preprocessor option `KERNEL_FLOAT_FAST_MATH=1` to change the default policy to `kf::fast_policy`. +This will use fast math for all functions and data types that support it. 
diff --git a/docs/guides/example.md b/docs/guides/example.md new file mode 100644 index 0000000..62a80da --- /dev/null +++ b/docs/guides/example.md @@ -0,0 +1,112 @@ +# Full CUDA example + +This page explains a CUDA program that estimates the value of pi using Kernel Float. + + +## Overview + +The program calculates Pi by generating random points within a unit square and counting how many fall inside the unit circle inscribed within that square. The ratio of points inside the circle to the total number of points approximates Pi/4. + +The kernel is shown below: + + +```c++ +namespace kf = kernel_float; + +using float_type = float; +static constexpr int VECTOR_SIZE = 4; + +__global__ void calculate_pi_kernel(int nx, int ny, int* global_count) { + int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + int thread_y = blockIdx.y * blockDim.y + threadIdx.y; + + kf::vec xi = thread_x * VECTOR_SIZE + kf::range(); + kf::vec yi = thread_y; + + kf::vec xf = kf::cast(xi) / float_type(nx); + kf::vec yf = kf::cast(yi) / float_type(ny); + + kf::vec dist_squared = xf * xf + yf * yf; + kf::vec dist = kf::sqrt(dist_squared); + + int n = kf::count(dist <= float_type(1)); + + if (n > 0) atomicAdd(global_count, n); +} +``` + + +## Code Explanation + +Let's go through the code step by step. + +```cpp +// Alias `kernel_float` as `kf` +namespace kf = kernel_float; +``` + +This creates an alias for `kernel_float`. + +```cpp +// Define the float type and vector size +using float_type = float; +static constexpr int VECTOR_SIZE = 4; +``` + +Define `float_type` as an alias for `float` to make it easy to change precision if needed. +The vector size is set to 4, meaning each thread will process 4 data points. + +```cpp +__global__ void calculate_pi_kernel(int nx, int ny, int* global_count) { +``` + +The CUDA kernel. There are `nx` points along the x axis and `ny` points along the y axis. 
+
+```cpp
+    int thread_x = blockIdx.x * blockDim.x + threadIdx.x;
+    int thread_y = blockIdx.y * blockDim.y + threadIdx.y;
+```
+
+Compute the global x- and y-index of this thread.
+
+```cpp
+    kf::vec xi = thread_x * VECTOR_SIZE + kf::range();
+    kf::vec yi = thread_y;
+```
+
+Compute the points that this thread will process.
+The x coordinates start at `thread_x * VECTOR_SIZE` and are then offset by the vector `[0, 1, 2, ..., VECTOR_SIZE-1]`.
+The y coordinates are all `thread_y`.
+
+```cpp
+    kf::vec xf = kf::cast(xi) / float_type(nx);
+    kf::vec yf = kf::cast(yi) / float_type(ny);
+```
+
+Divide `xi` and `yi` by `nx` and `ny` to normalize them to the `[0, 1]` range.
+
+```cpp
+    kf::vec dist_squared = xf * xf + yf * yf;
+```
+
+Compute the squared distance from the origin (0, 0) to each point from `xf`, `yf`.
+
+```cpp
+    kf::vec dist = kf::sqrt(dist_squared);
+```
+
+Take the element-wise square root.
+
+```cpp
+    int n = kf::count(dist <= float_type(1));
+```
+
+Count the number of points in the unit circle (i.e., for which the distance is at most 1).
+The expression `dist <= 1` returns a vector of booleans and `kf::count` counts the number of `true` values.
+
+```cpp
+atomicAdd(global_count, n);
+```
+
+Add `n` to the `global_count` variable.
+This must be done using an atomic operation since multiple threads will write this variable simultaneously. diff --git a/docs/guides/introduction.md b/docs/guides/introduction.md index e6a4434..31f44e4 100644 --- a/docs/guides/introduction.md +++ b/docs/guides/introduction.md @@ -1,56 +1,192 @@ -Getting started -=============== +# Getting started -Kernel Float is a header-only library that makes it easy to work with vector types and low-precision floating-point types, mainly focusing on CUDA kernel code. +**Kernel Float** is a header-only library that makes it easy to work with vector types and low-precision floating-point types, mainly focusing on CUDA kernel code. 
-Installation ------------- +## Installation -The easiest way to use the library is get the single header file from github: +The easiest way to use the library is to get the single header file from GitHub: ```bash wget https://raw.githubusercontent.com/KernelTuner/kernel_float/main/single_include/kernel_float.h ``` -Next, include this file into your program. -It is conventient to define a namespace alias `kf` to shorten the full name `kernel_float`. +Next, include this file in your program. It is convenient to define a namespace alias `kf` to shorten the full name `kernel_float`: - -```C++ +```cpp #include "kernel_float.h" namespace kf = kernel_float; ``` +## Vector types -Example C++ code ----------------- +Kernel Float essentially offers a single data type `kernel_float::vec` that stores `N` elements of type `T`. The simplest way to initialize a vector is using list-initialization: -Kernel Float essentially offers a single data-type `kernel_float::vec` that stores `N` elements of type `T`. -This type can be initialized normally using list-initialization (e.g., `{a, b, c}`) and elements can be accessed using the `[]` operator. -Operation overload is available to perform binary operations (such as `+`, `*`, and `&`), where the optimal intrinsic for the available types is selected automatically. +```cpp +kf::vec my_vector = {1.0f, 2.0f, 3.0f, 4.0f}; +``` -Many mathetical functions (like `log`, `sin`, `cos`) are also available, see the [API reference](../api) for the full list of functions. -In some cases, certain operations might not be natively supported by the platform for the some floating-point type. -In these cases, Kernel Float falls back to performing the operations in 32 bit precision. 
+It is also possible to automatically derive the type using `make_vec`: -The code below shows a very simple example of how to use Kernel Float: +```cpp +// The type will be vec +auto a = kf::make_vec(1.0, 2.0, 3.0); -```C++ -#include "kernel_float.h" -namespace kf = kernel_float; +// The type will be vec +auto b = kf::make_vec(7, 7); + +// The type will be vec +auto c = kf::make_vec(true, true, false, true); + +// This does not compile! +auto d = kf::make_vec(); +``` + +There are also many helper methods available to generate vectors; see the [API reference](../api). Some examples are `range`, `fill`, `ones`, and `zeros`. + +```cpp +// Generates [0, 1, 2, 3] +kf::vec a = kf::range(); + +// Generates [42.0, 42.0, 42.0, 42.0] +kf::vec b = kf::fill<4>(42.0); + +// Generates [0, 0, 0, 0] +kf::vec c = kf::zeros(); + +// Generates [true, true, true, true] +kf::vec d = kf::ones(); +``` + +You can also use the `*_like` functions to generate a vector based on another vector: + +```cpp +// Generates [1.0, 2.0, 3.0, 4.0] +kf::vec a = {1.0f, 2.0f, 3.0f, 4.0f}; + +// Generates [0.0, 0.0, 0.0, 0.0] +kf::vec b = kf::zeros_like(a); + +// Generates [1.0, 1.0, 1.0, 1.0] +kf::vec c = kf::ones_like(a); +``` + +## Accessing elements + +Accessing elements can be done using the regular `[]` operator. 
+ +```cpp +// Generate [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] +kf::vec a = kf::range(); + +// Returns 2.0 +float x = a[2]; + +// Set element at 2 to 42.0 +a[2] = 42.0; + +// Returns 42.0 +float y = a[2]; +``` + +You can get a pointer to the vector buffer by calling `data`: + +```cpp +// Generate vector +kf::vec v = {1.0f, 2.0f, 3.0f, 4.0f}; + +float* address = v.data(); + +// Set element at 0 to element at 1 +address[0] = address[1]; +``` -int main() { - using Type = float; - const int N = 8; +Iteration can be done by using a regular for-loop: - kf::vec i = kf::range(); - kf::vec x = kf::cast(i); - kf::vec y = x * kf::sin(x); - Type result = kf::sum(y); - printf("result=%f", double(result)); +```cpp +kf::vec vector = {1.0f, 2.0f, 3.0f, 4.0f}; - return EXIT_SUCCESS; +for (float x : vector) { + printf("x=%f\n", x); } ``` -Notice how easy it would be to change the floating-point type `Type` or the vector length `N` without affecting the rest of the code. +## Operator overloading + +The **arithmetic** operators `+`, `-`, `*`, `/`, and `%` are overloaded to perform element-wise operations. + +```cpp +// Generate [1.0f, 2.0f, 3.0f] +kf::vec a = {1.0f, 2.0f, 3.0f}; + +// Generate [1.0f, 1.0f, 1.0f] +kf::vec b = kf::ones(); + +// Add them together to create [2.0f, 3.0f, 4.0f] +kf::vec c = a + b; +``` + +The **comparison** operators `<`, `>`, `==`, `!=`, `<=`, `>=` are overloaded to perform element-wise operations. Note that the returned value is a vector containing 0s (`false`) and 1s (`true`). The element type and vector length will match the inputs. + +```cpp +// Generate doubles +kf::vec a = {4.0, -100.0, 0.0, 0.5, -3.0}; + +// Generate zeros +kf::vec zeros = kf::zeros_like(a); + +// Generates [false, true, false, false, true] +kf::vec result = a < zeros; +``` + +The **logical** operators `&&` and `||` are NOT overloaded. This is because there is no method to simulate the short-circuiting behavior. 
Instead, the operators `!` (not), `&` (and), `|` (or), and `^` (xor) are overloaded to behave as logical operators. + +```cpp +// Generate doubles +kf::vec a = {4.0, -100.0, 0.0, 0.5, -3.0}; + +// Generate zeros and ones +kf::vec zeros = kf::zeros_like(a); +kf::vec ones = kf::ones_like(a); + +// Generates [false, false, true, true, false] +kf::vec result = (a >= zeros) & (a <= ones); + +// Using `&&` instead of `&` results in a compilation error! +// kf::vec fail = (a >= zeros) && (a <= ones); +``` + +If the two inputs of a binary operator do not match (either element type and/or vector length), Kernel Float will automatically perform **type promotion** (described on the page [Type Promotion](promotion)). This allows our example to be simplified to just: + +```cpp +// Generate doubles +kf::vec a = {4.0, -100.0, 0.0, 0.5, -3.0}; + +// Generates [false, false, true, true, false] +kf::vec result = (a >= 0.0) & (a <= 1.0); +``` + +## Mathematical functions + +Many mathematical functions (like `log`, `sin`, `cos`) are also available; see the [API reference](../api) for the full list of functions. These always work element-wise: + +```cpp +// Input vector +kf::vec x = {0.0f, 1.0f, 2.0f, 3.0f}; + +// Gives [0.0, 0.84147098, 0.9092974, 0.14112001] +kf::vec a = kf::sin(x); + +// Gives [1.0, 0.54030231, -0.41614684, -0.9899925] +kf::vec b = kf::cos(x); + +// Gives [0.0, 1.0, 1.4142135, 1.7320508] +kf::vec c = kf::sqrt(x); + +// Gives [1.0, 2.7182818, 7.3890561, 20.085537] +kf::vec d = kf::exp(x); + +// Gives [0, 0, 0, 0] +kf::vec e = kf::isnan(x); +``` + +In some cases, certain operations might not be natively supported by the platform for some floating-point types. In these cases, Kernel Float falls back to performing the operations in 32-bit precision. 
diff --git a/docs/guides/memory.md b/docs/guides/memory.md new file mode 100644 index 0000000..2ddc7a2 --- /dev/null +++ b/docs/guides/memory.md @@ -0,0 +1,112 @@ +# Memory Operations + +A common problem in vector programming is that a "simple" pointer (such as `float*`) is provided, but you would like to read a vector of elements. This page describes solutions to that problem. + +## Using `read`/`write` + +The simplest solution is to use `read` and `write` to access `N` consecutive elements. + +```cpp +std::vector buffer = {1.0f, 2.0f, 3.0f, 4.0f}; + +float* pointer = buffer.data(); + +// Read 2 elements: buffer[0], buffer[1] +kf::vec a = kf::read<2>(pointer); + +// Read 3 elements: buffer[1], buffer[2], buffer[3] +kf::vec b = kf::read<3>(pointer + 1); + +// Write 2 elements: buffer[0], buffer[1] +kf::write(pointer, kf::vec(100.0f, 200.0f)); + +// Write 3 elements: buffer[1], buffer[2], buffer[3] +kf::write(pointer + 1, kf::vec(0.0f, 0.0f, 0.0f)); +``` + +## Using `read_aligned`/`write_aligned` + +For small data types, it can be highly beneficial to use **aligned** memory operations. +These allow the compiler to read/write the elements more efficiently but require that the accessed pointer is **aligned** to a certain vector size. + +```cpp +// Read 2 elements: buffer[0], buffer[1] +kf::vec a = kf::read_aligned<2>(pointer); + +// Read 2 elements: buffer[2], buffer[3] +kf::vec b = kf::read_aligned<2>(pointer + 2); + +// This is not allowed! `pointer+1` is not aligned to 2 elements! +// kf::vec b = kf::read_aligned<2>(pointer + 1); + +// Write 2 elements: buffer[0], buffer[1] +kf::write_aligned<2>(pointer, kf::vec(100.0f, 200.0f)); + +// Again, this is not allowed! `pointer+1` is not aligned to 3 elements! +// kf::write_aligned<3>(pointer + 1, kf::vec(0.0f, 0.0f, 0.0f)); +``` + +Note that Kernel Float does **not** check the alignment, not at compile-time and not at runtime. 
+Using an unaligned address results in **undefined behavior**: either a runtime crash, miscompilation, or invalid results. + +## Using `vec_ptr` + +`kf::vec_ptr` is a data type that wraps a regular `T*` (or `const T*`) pointer and allows easy accessing of elements as a vector using aligned memory operations. + +For example, given `vec_ptr x`, reading `x[10]` returns a vector containing `{buffer[40], buffer[41], buffer[42], buffer[43]}`. +Each index advances by `N` elements because you're working with vectors of size `N`. + +The code below shows an example: + +```cpp +std::vector buffer = {/* some data */}; +float* pointer = buffer.data(); + +kf::vec_ptr x = kf::assert_aligned<2>(pointer); + +// Get the elements {buffer[20], buffer[21]} +kf::vec a = x[10]; + +// Set the elements at buffer[20] and buffer[21]. +x(10) = kf::make_vec(42.0f, 42.0f); +``` + +In this example: + +* `kf::vec_ptr` wraps the `float*` pointer, allowing vectorized access to the data. +* `x[10]` accesses the elements at positions `20` and `21` (`10 * 2` and `10 * 2 + 1`). +* `x(10)` provides a way to write to the data at the same positions. + +### Handling Different Storage Types + +A notable feature of `vec_ptr` is its ability to interact with data stored in a different underlying type than the one you operate with. +This is achieved by specifying a third template argument `U` in `vec_ptr`. + +The `vec_ptr` class automatically handles the necessary type casting between `U` and `T`. +A cast from `U` to `T` is automatically inserted after each read and from `T` to `U` before each write. + +For example, suppose your data is stored in half precision, but you want to read and write it using double precision. 
+ + +```cpp +std::vector buffer = {/* some data */}; +half* pointer = buffer.data(); + +// Create vec_ptr +kf::vec_ptr x = kf::assert_aligned<2>(pointer); + +// Interact with the data in double precision +kf::vec_ptr y = x; + +// Get the elements {buffer[20], buffer[21]} in double precision +kf::vec a = y[10]; + +// Set the elements at buffer[20] and buffer[21]. +y(10) = kf::make_vec(42.0, 42.0); +``` + +In this example: + +* `x` is a `vec_ptr` pointing to the data stored in half precision. +* `y` is a `vec_ptr` that allows you to interact with the data as double precision, even though it is stored as half. + diff --git a/docs/guides/prelude.md b/docs/guides/prelude.md deleted file mode 100644 index 449dc88..0000000 --- a/docs/guides/prelude.md +++ /dev/null @@ -1,42 +0,0 @@ -Using `kernel_float::prelude` -=== - -When working with Kernel Float, you'll find that you need to prefix every function and type with the `kernel_float::...` prefix. -This can be a bit cumbersome. -It's strongly discouraged not to dump the entire `kernel_float` namespace into the global namespace (with `using namespace kernel_float`) since -many symbols in Kernel Float may clash with global symbols, causing conflicts and issues. - -To work around this, the library provides a handy `kernel_float::prelude` namespace. This namespace contains a variety of useful type and function aliases that won't conflict with global symbols. - -To make use of it, use the following code: - - -```C++ -#include "kernel_float.h" -using namespace kernel_float::prelude; - -// You can now use aliases like `kf`, `kvec`, `kint`, etc. -``` - -The prelude defines many aliases, include the following: - -| Prelude name | Full name | -|---|---| -| `kf` | `kernel_float` | -| `kvec` | `kernel_float::vec` | -| `into_kvec(v)` | `kernel_float::into_vec(v)` | -| `make_kvec(a, b, ...)` | `kernel_float::make_vec(a, b, ...)` | -| `kvec2`, `kvec3`, ... | `kernel_float::vec`, `kernel_float::vec`, ... 
| -| `kint` | `kernel_float::vec` | -| `kint2`, `kint3`, ... | `kernel_float::vec`, `kernel_float::vec`, ... | -| `klong` | `kernel_float::vec` | -| `klong2`, `klong3`, ... | `kernel_float::vec`, `kernel_float::vec`, ... | -| `kbfloat16x` | `kernel_float::vec` | -| `kbfloat16x2`, `kbfloat16x3`, ... | `kernel_float::vec`, `kernel_float::vec`, ... | -| `khalf` | `kernel_half::vec` | -| `khalf2`, `khalf3`, ... | `kernel_half::vec`, `kernel_half::vec`, ... | -| `kfloat` | `kernel_float::vec` | -| `kfloat2`, `kfloat3`, ... | `kernel_float::vec`, `kernel_float::vec`, ... | -| `kdouble` | `kernel_float::vec` | -| `kdouble2`, `kdouble3`, ... | `kernel_float::vec`, `kernel_float::vec`, ... | -| ... | ... | diff --git a/docs/guides/promotion.md b/docs/guides/promotion.md new file mode 100644 index 0000000..338ddb6 --- /dev/null +++ b/docs/guides/promotion.md @@ -0,0 +1,127 @@ +# Casting and Type Promotion + +This section is on data type casting and implicit type promotion. + +## Type Casting + +You can use the function `cast` to explicitly change the element type of a vector: + +```cpp +// a = [0, 1, 2, 3] +kf::vec a = kf::range(); + +// b = [0, 1, 2, 3] +kf::vec b = kf::cast(a); + +// c = [0.0f, 1.0f, 2.0f, 3.0f] +kf::vec c = kf::cast(a); + +// d = [false, true, true, true] +kf::vec d = kf::cast(a); +``` + +Kernel Float allows **implicit** type conversions between vector types **only** for **widening** casts (i.e., converting a smaller type into a larger type). 
+ +```cpp +// a = [0, 1, 2, 3] +kf::vec a = kf::range(); + +// This is allowed: int -> long +kf::vec b = a; + +// This is also allowed: int -> float +kf::vec c = a; +``` + +Kernel Float does **not** perform implicit conversion for narrowing casts; this requires an explicit cast: + +```cpp +// a = [0, 1, 2, 3] +kf::vec a = kf::range(); + +// This is NOT allowed, use `kf::cast(a)` +kf::vec b = a; // ERROR + +// This is NOT allowed, use `kf::cast(a)` +kf::vec c = a; // ERROR + +// This is NOT allowed, use `kf::cast(a)` +kf::vec d = a; // ERROR +``` + +Alternatively, it is possible to use `kf::cast_to` on the left side of the assignment to perform a cast: + +```cpp +// a = [0, 1, 2, 3] +kf::vec a = kf::range(); + +// Define b +kf::vec b; + +// This is equivalent to `b = kf::cast(a)` +kf::cast_to(b) = a; +``` + +## Type Promotion + +When performing operations between vectors of different types or sizes, Kernel Float automatically promotes types to ensure compatibility. This process is known as **type promotion**. + + + +Consider the following example. What should the type of `c` be in this case? + +```cpp +kf::vec a = {1.0f, 2.0f, 3.0f, 4.0f}; +kf::vec b = {10.0, 20.0, 30.0, 40.0}; + +kf::vec c = a + b; +``` + +Another example is the following snippet. Again, what should the type of `c` be? + +```cpp +kf::vec x = {1.0f, 2.0f, 3.0f, 4.0f}; +int factor = 2; + +kf::vec c = x * factor; +``` + +### How Type Promotion Works +Type promotion in Kernel Float unifies arguments for binary (and ternary) operations through the following steps: + +* **Vectorization**: Each non-vector argument is converted into a vector using the `into_vec` function. + +* **Length Unification**: All arguments must have the same length N or length 1. Vectors of length 1 are broadcasted to match length N. + +* **Type Unification**: The element types are promoted to a common type based on the following promotion rules. 
+ +### Promotion Rules +The rules for element type promotion in Kernel Float are slightly different from standard C++. Here's a summary: + +* **Boolean Types**: If one of the types is bool, the result type is the other type. + +* **Floating-Point and Integer**: If one type is floating-point and the other is an integer (signed or unsigned), the result is the floating-point type. + +* **Floating-Point Types**: If both are floating-point types, the larger (wider) type is chosen. +Exception: Combining half and bfloat16 results in float. + +* **Integer Types**: If both are integers of the same signedness (both signed or both unsigned), the larger type is chosen. +Combining a signed integer and an unsigned integer is not allowed. + +### Overview + +The following table summarizes the type promotion rules. The labels used are: + +- `b`: boolean +- `iN`: signed integer of `N` bits (e.g., `int`, `long`) +- `uN`: unsigned integer of `N` bits (e.g., `unsigned int`, `size_t`) +- `fN`: floating-point type of `N` bits (e.g., `float`, `double`) +- `bf16`: bfloat16 floating-point format. + +```{csv-table} Type Promotion Rules. +:file: promotion_table.csv +``` + +.. csv-table:: Type Promotion Rules. + :file: promotion_table.csv + diff --git a/docs/guides/promotion.rst b/docs/guides/promotion.rst deleted file mode 100644 index cf48d9e..0000000 --- a/docs/guides/promotion.rst +++ /dev/null @@ -1,34 +0,0 @@ -Type Promotion -============== - -For operations that involve two (or more) input arguments, ``kernel_float`` will first convert the inputs into a common type before applying the operation. -For example, when adding ``vec`` to ``vec``, both arguments must first be converted into a ``vec``. - -This procedure is called "type promotion" and is implemented as follows. -Initially, every argument is transformed into a vector using the ``into_vec`` function -Next, all arguments must have length ``N`` or length ``1``, where vectors of length ``1`` are repeated to become length ``N``. 
-Finally, the vector element types are promoted into a shared type. - -The rules for element type promotion in ``kernel_float`` are slightly different than in regular C++. -In a nutshell, for two element types, the promotion rules can be summarized as follows: - -* If one of the types is ``bool``, the result is the other type. -* If one type is a floating-point and the other is an integer (signed or unsigned), the outcome is the floating-point type. -* If both are floating-point types, the largest of the two is chosen. An exception is combining ``half`` and ``bfloat16``, which results in ``float``. -* If both types are integer types of the same signedness, the largest of the two is chosen. -* Combining a signed integer and unsigned integer type is not allowed. - -Overview --------- - -The type promotion rules are shown in the table below. -The labels are as follows: - -* ``b``: boolean -* ``iN``: signed integer of ``N`` bits (e.g., ``int``, ``long``) -* ``uN``: unsigned integer of ``N`` bits (e.g., ``unsigned int``, ``size_t``) -* ``fN``: floating-point type of ``N`` bits (e.g., ``float``, ``double``) -* ``bf16``: bfloat16 floating-point format. - -.. csv-table:: Type Promotion Rules. 
- :file: promotion_table.csv diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 145f80b..239fba5 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,2 +1,3 @@ +add_subdirectory(pi) add_subdirectory(vector_add) add_subdirectory(vector_add_tiling) diff --git a/examples/pi/CMakeLists.txt b/examples/pi/CMakeLists.txt new file mode 100644 index 0000000..6a767d3 --- /dev/null +++ b/examples/pi/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 3.17) + +set (PROJECT_NAME kernel_float_pi) +project(${PROJECT_NAME} LANGUAGES CXX CUDA) +set (CMAKE_CXX_STANDARD 17) + +add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu") +target_link_libraries(${PROJECT_NAME} kernel_float) +set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80") + +find_package(CUDA REQUIRED) +target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE}) diff --git a/examples/pi/main.cu b/examples/pi/main.cu new file mode 100644 index 0000000..58edd8b --- /dev/null +++ b/examples/pi/main.cu @@ -0,0 +1,111 @@ +#include +#include + +#include "kernel_float.h" + +#define CUDA_CHECK(call) \ + do { \ + cudaError_t __err = call; \ + if (__err != cudaSuccess) { \ + fprintf( \ + stderr, \ + "CUDA error at %s:%d code=%d(%s) \"%s\" \n", \ + __FILE__, \ + __LINE__, \ + __err, \ + cudaGetErrorString(__err), \ + #call); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// Alias `kernel_float` as `kf` +namespace kf = kernel_float; + +// Define the float type and vector size +using float_type = float; +static constexpr int VECTOR_SIZE = 4; + +__global__ void calculate_pi_kernel(int nx, int ny, int* global_count) { + // Calculate the global x and y indices for this thread within the grid + int thread_x = blockIdx.x * blockDim.x + threadIdx.x; + int thread_y = blockIdx.y * blockDim.y + threadIdx.y; + + // Calculate the x and y coordinates as integers. + // The x coordinates are: [thread_x * VECTOR_SIZE, thread_x * VECTOR_SIZE + 1, ...] 
+ // The y coordinates are: [thread_y, thread_y, ...] + kf::vec xi = thread_x * VECTOR_SIZE + kf::range(); + kf::vec yi = thread_y; + + // Normalize the integers to values between 0 and 1. + kf::vec xf = kf::cast(xi) / float_type(nx); + kf::vec yf = kf::cast(yi) / float_type(ny); + + // Compute the squared distance to the origin and then take the + // square root to get the distance to the origin. + kf::vec dist_squared = xf * xf + yf * yf; + kf::vec dist = kf::sqrt(dist_squared); + + // Count the number of points within the unit circle. + // The expression `dist <= 1` returns a boolean vector + // and `kf::count` counts how many elements are `true`. + int n = kf::count(dist <= float_type(1)); + + // Atomically add 'n' to 'global_count' + atomicAdd(global_count, n); +} + +double calculate_pi(int nx, int ny) { + // Allocate memory on the device (GPU) for 'global_count' to accumulate the count of points inside the circle + int* d_global_count; + CUDA_CHECK(cudaMalloc(&d_global_count, sizeof(int))); + + // Initialize the device memory to zero + CUDA_CHECK(cudaMemset(d_global_count, 0, sizeof(int))); + + // Each thread processes 'VECTOR_SIZE' points in the x-direction + int num_threads_x = (nx + VECTOR_SIZE - 1) / VECTOR_SIZE; + + // Define the dimensions of each thread block (number of threads per block) + dim3 block_size(16, 16); // Each block contains 16 threads in x and y directions + + // Calculate the number of blocks needed in the grid to cover all threads + dim3 grid_size( + (num_threads_x + block_size.x - 1) / block_size.x, // Number of blocks in x-direction + (ny + block_size.y - 1) / block_size.y // Number of blocks in y-direction + ); + + // Launch the kernel on the GPU with the calculated grid and block dimensions + calculate_pi_kernel<<>>(nx, ny, d_global_count); + + // Check for any errors during kernel launch (asynchronous) + CUDA_CHECK(cudaGetLastError()); + + // Wait for the kernel to finish executing and check for errors (synchronization point) + 
CUDA_CHECK(cudaDeviceSynchronize()); + + // Copy the result from device memory back to host memory + int h_global_count = 0; // Host variable to store the count + CUDA_CHECK(cudaMemcpy(&h_global_count, d_global_count, sizeof(int), cudaMemcpyDeviceToHost)); + + // Free the allocated device memory + CUDA_CHECK(cudaFree(d_global_count)); + + // Calculate the estimated value of Pi using the ratio of points inside the circle to the total points + int total_points = nx * ny; + double pi_estimate = 4.0 * (double(h_global_count) / total_points); + + return pi_estimate; +} + +int main() { + CUDA_CHECK(cudaSetDevice(0)); + + for (int n = 1; n <= 16384; n *= 2) { + double pi = calculate_pi(n, n); + + printf("nx=%d ny=%d pi=%f\n", n, n, pi); + } + + return EXIT_SUCCESS; +} diff --git a/examples/vector_add/main.cu b/examples/vector_add/main.cu index 48a9a45..705cf8c 100644 --- a/examples/vector_add/main.cu +++ b/examples/vector_add/main.cu @@ -4,7 +4,7 @@ #include #include "kernel_float.h" -using namespace kernel_float::prelude; +namespace kf = kernel_float; void cuda_check(cudaError_t code) { if (code != cudaSuccess) { diff --git a/examples/vector_add_tiling/main.cu b/examples/vector_add_tiling/main.cu index af7aa94..ddd28ab 100644 --- a/examples/vector_add_tiling/main.cu +++ b/examples/vector_add_tiling/main.cu @@ -5,7 +5,7 @@ #include "kernel_float.h" #include "kernel_float/tiling.h" -using namespace kernel_float::prelude; +namespace kf = kernel_float; void cuda_check(cudaError_t code) { if (code != cudaSuccess) { diff --git a/include/kernel_float.h b/include/kernel_float.h index 53df30b..ee098fc 100644 --- a/include/kernel_float.h +++ b/include/kernel_float.h @@ -4,13 +4,13 @@ #include "kernel_float/base.h" #include "kernel_float/bf16.h" #include "kernel_float/binops.h" +#include "kernel_float/constant.h" #include "kernel_float/conversion.h" #include "kernel_float/fp16.h" #include "kernel_float/iterate.h" #include "kernel_float/macros.h" #include "kernel_float/memory.h" 
#include "kernel_float/meta.h" -#include "kernel_float/prelude.h" #include "kernel_float/reduce.h" #include "kernel_float/triops.h" #include "kernel_float/unops.h" diff --git a/include/kernel_float/apply.h b/include/kernel_float/apply.h index cf63787..3a7e02c 100644 --- a/include/kernel_float/apply.h +++ b/include/kernel_float/apply.h @@ -118,43 +118,66 @@ broadcast_like(const V& input, const R& other) { namespace detail { -template -struct apply_recur_impl; - template struct apply_impl { - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { - apply_recur_impl::call(fun, result, inputs...); + KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... args) { +#pragma unroll + for (size_t i = 0; i < N; i++) { + output[i] = fun(args[i]...); + } } }; -template -struct apply_recur_impl { - static constexpr size_t K = round_up_to_power_of_two(N) / 2; +template +struct apply_fastmath_impl: apply_impl {}; +} // namespace detail - template - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { - apply_impl::call(fun, result, inputs...); - apply_impl::call(fun, result + K, (inputs + K)...); - } +struct accurate_policy { + template + using type = detail::apply_impl; }; -template<> -struct apply_recur_impl<0> { - template - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) {} +struct fast_policy { + template + using type = detail::apply_fastmath_impl; }; -template<> -struct apply_recur_impl<1> { - template - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... 
inputs) { - result[0] = fun(inputs[0]...); +#ifdef KERNEL_FLOAT_POLICY +using default_policy = KERNEL_FLOAT_POLICY; +#else +using default_policy = accurate_policy; +#endif + +namespace detail { + +template +struct map_policy_impl { + static constexpr size_t packet_size = preferred_vector_size::value; + static constexpr size_t remainder = N % packet_size; + + KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... args) { + if constexpr (N / packet_size > 0) { +#pragma unroll + for (size_t i = 0; i < N - remainder; i += packet_size) { + Policy::template type::call( + fun, + output + i, + (args + i)...); + } + } + + if constexpr (remainder > 0) { +#pragma unroll + for (size_t i = N - remainder; i < N; i++) { + Policy::template type::call(fun, output + i, (args + i)...); + } + } } }; template -struct apply_fastmath_impl: apply_impl {}; +using map_impl = map_policy_impl; + } // namespace detail template @@ -171,40 +194,13 @@ using map_type = * vec squared = map([](auto x) { return x * x; }, input); // [1.0f, 4.0f, 9.0f, 16.0f] * ``` */ -template +template KERNEL_FLOAT_INLINE map_type map(F fun, const Args&... args) { using Output = result_t...>; using E = broadcast_vector_extent_type; - vector_storage result; - - // Use the `apply_fastmath_impl` if KERNEL_FLOAT_FAST_MATH is enabled -#if KERNEL_FLOAT_FAST_MATH - using apply_impl = detail::apply_fastmath_impl...>; -#else - using apply_impl = detail::apply_impl...>; -#endif - - apply_impl::call( - fun, - result.data(), - (detail::broadcast_impl, vector_extent_type, E>::call( - into_vector_storage(args)) - .data())...); - - return result; -} - -/** - * Apply the function `F` to each element from the vector `input` and return the results as a new vector. This - * uses fast-math if available for the given function `F`, otherwise this function behaves like `map`. - */ -template -KERNEL_FLOAT_INLINE map_type fast_map(F fun, const Args&... 
args) { - using Output = result_t...>; - using E = broadcast_vector_extent_type; - vector_storage result; + vector_storage> result; - detail::apply_fastmath_impl...>::call( + detail::map_policy_impl, Output, vector_value_type...>::call( fun, result.data(), (detail::broadcast_impl, vector_extent_type, E>::call( diff --git a/include/kernel_float/base.h b/include/kernel_float/base.h index e32ac41..403bceb 100644 --- a/include/kernel_float/base.h +++ b/include/kernel_float/base.h @@ -45,7 +45,6 @@ struct alignas(Alignment) aligned_array { }; template - struct aligned_array { KERNEL_FLOAT_INLINE T* data() { @@ -85,10 +84,12 @@ struct extent; template struct extent { - static constexpr size_t value = N; - static constexpr size_t size = N; + static constexpr size_t number_of_elements = N; }; +template +static constexpr size_t extent_size = E::number_of_elements; + namespace detail { // Indicates that elements of type `T` offer less precision than floats, thus operations // on elements of type `T` can be performed by upcasting them to ` float`. 
@@ -215,7 +216,7 @@ KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong1, ulonglong2, ulon KERNEL_FLOAT_DEFINE_VECTOR_TYPE(float, float1, float2, float3, float4) KERNEL_FLOAT_DEFINE_VECTOR_TYPE(double, double1, double2, double3, double4) -template> +template> struct vector; template @@ -224,11 +225,16 @@ struct into_vector_impl> { using extent_type = E; KERNEL_FLOAT_INLINE - static vector_storage call(const vector& input) { + static vector_storage> call(const vector& input) { return input.storage(); } }; +template +struct preferred_vector_size { + static constexpr size_t value = 1; +}; + template struct vector_traits; @@ -247,13 +253,13 @@ template using vector_extent_type = typename into_vector_impl::extent_type; template -static constexpr size_t vector_extent = vector_extent_type::value; +static constexpr size_t vector_size = extent_size>; template using into_vector_type = vector, vector_extent_type>; template -using vector_storage_type = vector_storage, vector_extent>; +using vector_storage_type = vector_storage, vector_size>; template using promoted_vector_value_type = promote_t...>; diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h index 84d2f4c..7db292c 100644 --- a/include/kernel_float/bf16.h +++ b/include/kernel_float/bf16.h @@ -11,6 +11,12 @@ #include "vector.h" namespace kernel_float { + +template<> +struct preferred_vector_size<__nv_bfloat16> { + static constexpr size_t value = 2; +}; + KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__nv_bfloat16) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __nv_bfloat16) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __nv_bfloat16) @@ -217,58 +223,10 @@ KERNEL_FLOAT_BF16_CAST( #endif using bfloat16 = __nv_bfloat16; +KERNEL_FLOAT_VECTOR_ALIAS(bfloat16x, __nv_bfloat16) //KERNEL_FLOAT_TYPE_ALIAS(float16x, __nv_bfloat16) //KERNEL_FLOAT_TYPE_ALIAS(f16x, __nv_bfloat16) -#if KERNEL_FLOAT_CUDA_ARCH >= 800 -namespace detail { -template<> -struct dot_impl<__nv_bfloat16, 0> { - KERNEL_FLOAT_INLINE - static __nv_bfloat16 
call(const __nv_bfloat16* left, const __nv_bfloat16* right) { - return __nv_bfloat16(0); - } -}; - -template<> -struct dot_impl<__nv_bfloat16, 1> { - KERNEL_FLOAT_INLINE - static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) { - return __hmul(left[0], right[0]); - } -}; - -template -struct dot_impl<__nv_bfloat16, N> { - static_assert(N >= 2, "internal error"); - - KERNEL_FLOAT_INLINE - static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) { - __nv_bfloat162 first_a = {left[0], left[1]}; - __nv_bfloat162 first_b = {right[0], right[1]}; - __nv_bfloat162 accum = __hmul2(first_a, first_b); - -#pragma unroll - for (size_t i = 2; i + 1 < N; i += 2) { - __nv_bfloat162 a = {left[i], left[i + 1]}; - __nv_bfloat162 b = {right[i], right[i + 1]}; - accum = __hfma2(a, b, accum); - } - - __nv_bfloat16 result = __hadd(accum.x, accum.y); - - if (N % 2 != 0) { - __nv_bfloat16 a = left[N - 1]; - __nv_bfloat16 b = right[N - 1]; - result = __hfma(a, b, result); - } - - return result; - } -}; -} // namespace detail -#endif - } // namespace kernel_float #if KERNEL_FLOAT_FP16_AVAILABLE diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h index 82c375b..eb958f1 100644 --- a/include/kernel_float/binops.h +++ b/include/kernel_float/binops.h @@ -50,16 +50,9 @@ KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, co using O = result_t; using E = broadcast_vector_extent_type; - vector_storage result; + vector_storage> result; -// Use the `apply_fastmath_impl` if KERNEL_FLOAT_FAST_MATH is enabled -#if KERNEL_FLOAT_FAST_MATH - using apply_impl = detail::apply_fastmath_impl; -#else - using apply_impl = detail::apply_impl; -#endif - - apply_impl::call( + detail::map_impl, O, T, T>::call( fun, result.data(), detail::convert_impl, vector_extent_type, T, E>::call( @@ -304,20 +297,17 @@ struct apply_fastmath_impl, N, T, T, T> { T rhs_rcp[N]; // Fast way to perform division is to multiply by the reciprocal - 
apply_fastmath_impl, N, T, T, T>::call({}, rhs_rcp, rhs); + apply_fastmath_impl, N, T, T>::call({}, rhs_rcp, rhs); apply_fastmath_impl, N, T, T, T>::call({}, result, lhs, rhs_rcp); } }; #if KERNEL_FLOAT_IS_DEVICE -template -struct apply_fastmath_impl, N, float, float, float> { +template<> +struct apply_fastmath_impl, 1, float, float, float> { KERNEL_FLOAT_INLINE static void call(ops::divide fun, float* result, const float* lhs, const float* rhs) { -#pragma unroll - for (size_t i = 0; i < N; i++) { - result[i] = __fdividef(lhs[i], rhs[i]); - } + *result = __fdividef(*lhs, *rhs); } }; #endif @@ -327,9 +317,9 @@ template> KERNEL_FLOAT_INLINE zip_common_type, T, T> fast_divide(const L& left, const R& right) { using E = broadcast_vector_extent_type; - vector_storage result; + vector_storage> result; - detail::apply_fastmath_impl, E::value, T, T, T>::call( + detail::map_policy_impl, extent_size, T, T, T>::call( ops::divide {}, result.data(), detail::convert_impl, vector_extent_type, T, E>::call( diff --git a/include/kernel_float/conversion.h b/include/kernel_float/conversion.h index 3e56eaa..8538be7 100644 --- a/include/kernel_float/conversion.h +++ b/include/kernel_float/conversion.h @@ -14,10 +14,10 @@ namespace detail { template struct convert_impl { KERNEL_FLOAT_INLINE - static vector_storage call(vector_storage input) { + static vector_storage> call(vector_storage> input) { using F = ops::cast; - vector_storage intermediate; - detail::apply_impl::call(F {}, intermediate.data(), input.data()); + vector_storage> intermediate; + detail::map_impl, T2, T>::call(F {}, intermediate.data(), input.data()); return detail::broadcast_impl::call(intermediate); } }; @@ -26,7 +26,7 @@ struct convert_impl { template struct convert_impl { KERNEL_FLOAT_INLINE - static vector_storage call(vector_storage input) { + static vector_storage> call(vector_storage> input) { return input; } }; @@ -35,7 +35,7 @@ struct convert_impl { template struct convert_impl { KERNEL_FLOAT_INLINE - static 
vector_storage call(vector_storage input) { + static vector_storage> call(vector_storage> input) { return detail::broadcast_impl::call(input); } }; @@ -44,11 +44,11 @@ struct convert_impl { template struct convert_impl { KERNEL_FLOAT_INLINE - static vector_storage call(vector_storage input) { + static vector_storage> call(vector_storage> input) { using F = ops::cast; - vector_storage result; - detail::apply_impl::call(F {}, result.data(), input.data()); + vector_storage> result; + detail::map_impl, T2, T>::call(F {}, result.data(), input.data()); return result; } }; diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h index ac760e9..0b62d9b 100644 --- a/include/kernel_float/fp16.h +++ b/include/kernel_float/fp16.h @@ -9,6 +9,12 @@ #include "vector.h" namespace kernel_float { + +template<> +struct preferred_vector_size<__half> { + static constexpr size_t value = 2; +}; + KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__half) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __half) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __half) @@ -165,58 +171,10 @@ KERNEL_FLOAT_FP16_CAST(unsigned long, __ull2half_rn(input), (unsigned long)(__ha KERNEL_FLOAT_FP16_CAST(unsigned long long, __ull2half_rn(input), __half2ull_rz(input)); using half = __half; +KERNEL_FLOAT_VECTOR_ALIAS(half, __half) //KERNEL_FLOAT_TYPE_ALIAS(float16x, __half) //KERNEL_FLOAT_TYPE_ALIAS(f16x, __half) -#if KERNEL_FLOAT_IS_DEVICE -namespace detail { -template<> -struct dot_impl<__half, 0> { - KERNEL_FLOAT_INLINE - static __half call(const __half* left, const __half* right) { - return __half(0); - } -}; - -template<> -struct dot_impl<__half, 1> { - KERNEL_FLOAT_INLINE - static __half call(const __half* left, const __half* right) { - return __hmul(left[0], right[0]); - } -}; - -template -struct dot_impl<__half, N> { - static_assert(N >= 2, "internal error"); - - KERNEL_FLOAT_INLINE - static __half call(const __half* left, const __half* right) { - __half2 first_a = {left[0], left[1]}; - __half2 first_b = 
{right[0], right[1]}; - __half2 accum = __hmul2(first_a, first_b); - -#pragma unroll - for (size_t i = 2; i + 2 <= N; i += 2) { - __half2 a = {left[i], left[i + 1]}; - __half2 b = {right[i], right[i + 1]}; - accum = __hfma2(a, b, accum); - } - - __half result = __hadd(accum.x, accum.y); - - if (N % 2 != 0) { - __half a = left[N - 1]; - __half b = right[N - 1]; - result = __hfma(a, b, result); - } - - return result; - } -}; -} // namespace detail -#endif - } // namespace kernel_float #endif diff --git a/include/kernel_float/iterate.h b/include/kernel_float/iterate.h index fd5e43d..2c7cc4a 100644 --- a/include/kernel_float/iterate.h +++ b/include/kernel_float/iterate.h @@ -22,7 +22,7 @@ KERNEL_FLOAT_INLINE void for_each(V&& input, F fun) { auto storage = into_vector_storage(input); #pragma unroll - for (size_t i = 0; i < vector_extent; i++) { + for (size_t i = 0; i < vector_size; i++) { fun(storage.data()[i]); } } @@ -86,7 +86,7 @@ KERNEL_FLOAT_INLINE vector> range() { */ template KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { - return range, vector_extent>(); + return range, vector_size>(); } /** @@ -111,11 +111,11 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { */ template KERNEL_FLOAT_INLINE vector> each_index(const V& = {}) { - return range>(); + return range>(); } namespace detail { -template, size_t N = vector_extent> +template, size_t N = vector_size> struct flatten_impl { using value_type = typename flatten_impl::value_type; static constexpr size_t size = N * flatten_impl::size; @@ -177,7 +177,7 @@ KERNEL_FLOAT_INLINE flatten_type flatten(const V& input) { namespace detail { template> struct concat_base_impl { - static constexpr size_t size = vector_extent; + static constexpr size_t size = vector_size; KERNEL_FLOAT_INLINE static void call(U* output, const V& input) { vector_storage storage = into_vector_storage(input); @@ -294,7 +294,7 @@ using select_type = vector, extent>>; template KERNEL_FLOAT_INLINE select_type 
select(const V& input, const Is&... indices) { using T = vector_value_type; - static constexpr size_t N = vector_extent; + static constexpr size_t N = vector_size; static constexpr size_t M = concat_size; vector_storage index_set; diff --git a/include/kernel_float/macros.h b/include/kernel_float/macros.h index 621cdea..4eee8f1 100644 --- a/include/kernel_float/macros.h +++ b/include/kernel_float/macros.h @@ -63,8 +63,8 @@ #define KERNEL_FLOAT_MAX_ALIGNMENT (32) -#ifndef KERNEL_FLOAT_FAST_MATH -#define KERNEL_FLOAT_FAST_MATH (0) +#if KERNEL_FLOAT_FAST_MATH +#define KERNEL_FLOAT_POLICY ::kernel_float::fast_policy; #endif #endif //KERNEL_FLOAT_MACROS_H diff --git a/include/kernel_float/memory.h b/include/kernel_float/memory.h index 66308b6..d76ed1e 100644 --- a/include/kernel_float/memory.h +++ b/include/kernel_float/memory.h @@ -38,7 +38,7 @@ struct copy_impl> { */ template> KERNEL_FLOAT_INLINE vector read(const T* ptr, const I& indices, const M& mask = true) { - return detail::copy_impl::load( + return detail::copy_impl>::load( ptr, convert_storage(indices, E()).data(), convert_storage(mask, E()).data()); @@ -65,7 +65,7 @@ template< typename M = bool, typename E = broadcast_vector_extent_type> KERNEL_FLOAT_INLINE void write(T* ptr, const I& indices, const V& values, const M& mask = true) { - return detail::copy_impl::store( + return detail::copy_impl>::store( ptr, convert_storage(values, E()).data(), convert_storage(indices, E()).data(), @@ -102,7 +102,7 @@ KERNEL_FLOAT_INLINE vector> read(const T* ptr) { */ template KERNEL_FLOAT_INLINE void write(T* ptr, const V& values) { - static constexpr size_t N = vector_extent; + static constexpr size_t N = vector_size; write(ptr, range(), values); } @@ -277,7 +277,7 @@ KERNEL_FLOAT_INLINE vector> read_aligned(const T* ptr) { */ template KERNEL_FLOAT_INLINE void write_aligned(T* ptr, const V& values) { - static constexpr size_t N = vector_extent; + static constexpr size_t N = vector_size; static constexpr size_t alignment = 
detail::gcd(Align * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); return detail::copy_aligned_impl::store( diff --git a/include/kernel_float/reduce.h b/include/kernel_float/reduce.h index 8ec790d..c859265 100644 --- a/include/kernel_float/reduce.h +++ b/include/kernel_float/reduce.h @@ -2,6 +2,7 @@ #define KERNEL_FLOAT_REDUCE_H #include "binops.h" +#include "triops.h" namespace kernel_float { namespace detail { @@ -23,7 +24,7 @@ struct reduce_recur_impl { template KERNEL_FLOAT_INLINE static T call(F fun, const T* input) { vector_storage temp; - apply_impl::call(fun, temp.data(), input, input + K); + map_impl::call(fun, temp.data(), input, input + K); if constexpr (N < 2 * K) { #pragma unroll @@ -73,7 +74,7 @@ struct reduce_recur_impl<2> { */ template KERNEL_FLOAT_INLINE vector_value_type reduce(F fun, const V& input) { - return detail::reduce_impl, vector_value_type>::call( + return detail::reduce_impl, vector_value_type>::call( fun, into_vector_storage(input).data()); } @@ -177,14 +178,38 @@ template struct dot_impl { KERNEL_FLOAT_INLINE static T call(const T* left, const T* right) { - vector_storage intermediate; - detail::apply_impl, N, T, T, T>::call( - ops::multiply(), - intermediate.data(), - left, - right); - - return detail::reduce_impl, N, T>::call(ops::add(), intermediate.data()); + static constexpr size_t K = preferred_vector_size::value; + T result = {}; + + if constexpr (N / K > 0) { + T accum[K] = {T {}}; + apply_impl, K, T, T, T>::call({}, accum, left, right); + +#pragma unroll + for (size_t i = 1; i < N / K; i++) { + apply_impl, K, T, T, T, T>::call( + ops::fma {}, + accum, + left + i * K, + right + i * K, + accum); + } + + result = reduce_impl, K, T>::call({}, accum); + } + + if constexpr (N % K > 0) { + for (size_t i = N - N % K; i < N; i++) { + apply_impl, 1, T, T, T, T>::call( + {}, + &result, + left + i, + right + i, + &result); + } + } + + return result; } }; } // namespace detail @@ -203,7 +228,7 @@ struct dot_impl { template> KERNEL_FLOAT_INLINE T 
dot(const L& left, const R& right) { using E = broadcast_vector_extent_type; - return detail::dot_impl::call( + return detail::dot_impl>::call( convert_storage(left, E {}).data(), convert_storage(right, E {}).data()); } @@ -273,7 +298,7 @@ struct magnitude_impl { */ template> KERNEL_FLOAT_INLINE T mag(const V& input) { - return detail::magnitude_impl>::call(into_vector_storage(input).data()); + return detail::magnitude_impl>::call(into_vector_storage(input).data()); } } // namespace kernel_float diff --git a/include/kernel_float/triops.h b/include/kernel_float/triops.h index 44f6db2..12cca59 100644 --- a/include/kernel_float/triops.h +++ b/include/kernel_float/triops.h @@ -39,9 +39,9 @@ template< typename E = broadcast_vector_extent_type> KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values, const R& false_values) { using F = ops::conditional; - vector_storage result; + vector_storage> result; - detail::apply_impl::call( + detail::map_impl, T, bool, T, T>::call( F {}, result.data(), detail::convert_impl, vector_extent_type, bool, E>::call( @@ -92,11 +92,25 @@ namespace ops { template struct fma { KERNEL_FLOAT_INLINE T operator()(T a, T b, T c) { - return a * b + c; + return ops::add {}(ops::multiply {}(a, b), c); } }; +} // namespace ops + +namespace detail { +template +struct apply_impl, N, T, T, T, T> { + KERNEL_FLOAT_INLINE + static void call(ops::fma, T* output, const T* a, const T* b, const T* c) { + T temp[N]; + apply_impl, N, T, T, T>::call({}, temp, a, b); + apply_impl, N, T, T, T>::call({}, output, temp, c); + } +}; +} // namespace detail #if KERNEL_FLOAT_IS_DEVICE +namespace ops { template<> struct fma { KERNEL_FLOAT_INLINE float operator()(float a, float b, float c) { @@ -110,11 +124,11 @@ struct fma { return __fma_rn(a, b, c); } }; -#endif } // namespace ops +#endif /** - * Computes the result of `a * b + c`. This is done in a single operation if possible. + * Computes the result of `a * b + c`. 
This is done in a single operation if possible for the given vector type. */ template< typename A, @@ -124,9 +138,9 @@ template< typename E = broadcast_vector_extent_type> KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) { using F = ops::fma; - vector_storage result; + vector_storage> result; - detail::apply_impl::call( + detail::map_impl, T, T, T, T>::call( F {}, result.data(), detail::convert_impl, vector_extent_type, T, E>::call( diff --git a/include/kernel_float/unops.h b/include/kernel_float/unops.h index 70d629c..fce130e 100644 --- a/include/kernel_float/unops.h +++ b/include/kernel_float/unops.h @@ -85,10 +85,10 @@ KERNEL_FLOAT_INLINE vector> cast(const V& input) { } #define KERNEL_FLOAT_DEFINE_UNARY_FUN(NAME) \ - template \ + template \ KERNEL_FLOAT_INLINE vector, vector_extent_type> NAME(const V& input) { \ using F = ops::NAME>; \ - return map(F {}, input); \ + return ::kernel_float::map(F {}, input); \ } #define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ @@ -166,7 +166,7 @@ KERNEL_FLOAT_DEFINE_UNARY_MATH(ilogb) KERNEL_FLOAT_DEFINE_UNARY_MATH(lgamma) KERNEL_FLOAT_DEFINE_UNARY_MATH(log) KERNEL_FLOAT_DEFINE_UNARY_MATH(log10) -KERNEL_FLOAT_DEFINE_UNARY_MATH(logb) +KERNEL_FLOAT_DEFINE_UNARY_MATH(log2) KERNEL_FLOAT_DEFINE_UNARY_MATH(nearbyint) KERNEL_FLOAT_DEFINE_UNARY_MATH(normcdf) KERNEL_FLOAT_DEFINE_UNARY_MATH(rcbrt) @@ -193,12 +193,11 @@ KERNEL_FLOAT_DEFINE_UNARY_STRUCT(rcp, 1.0 / input, 1.0f / input) KERNEL_FLOAT_DEFINE_UNARY_FUN(rcp) -#define KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(NAME) \ - template \ - KERNEL_FLOAT_INLINE vector, vector_extent_type> fast_##NAME( \ - const V& input) { \ - using F = ops::NAME>; \ - return fast_map(F {}, input); \ +#define KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(NAME) \ + template \ + KERNEL_FLOAT_INLINE vector, vector_extent_type> fast_##NAME( \ + const V& input) { \ + return ::kernel_float::map(ops::NAME> {}, input); \ } KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(exp) @@ -209,17 +208,17 @@ 
KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(rsqrt) KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(sin) KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(cos) KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(tan) +KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(exp2) +KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(log2) #if KERNEL_FLOAT_IS_DEVICE #define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(T, F, FAST_FUN) \ namespace detail { \ - template \ - struct apply_fastmath_impl, N, T, T> { \ + template<> \ + struct apply_fastmath_impl, 1, T, T> { \ KERNEL_FLOAT_INLINE static void call(ops::F, T* result, const T* inputs) { \ - for (size_t i = 0; i < N; i++) { \ - result[i] = FAST_FUN(inputs[i]); \ - } \ + *result = FAST_FUN(*inputs); \ } \ }; \ } @@ -227,26 +226,28 @@ KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(tan) KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(float, exp, __expf) KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(float, log, __logf) -#define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(T, F, INSTR) \ +#define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(T, F, INSTR, REG) \ namespace detail { \ - template \ - struct apply_fastmath_impl, N, T, T> { \ + template<> \ + struct apply_fastmath_impl, 1, T, T> { \ KERNEL_FLOAT_INLINE static void call(ops::F fun, T* result, const T* inputs) { \ - for (size_t i = 0; i < N; i++) { \ - asm(INSTR, : "=f"(result[i]) : "f"(inputs[i])); \ - } \ + asm(INSTR : "=" REG(*result) : REG(*inputs)); \ } \ }; \ } -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rcp, "rcp.approx.ftz.f64 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rsqrt, "rsqrt.approx.f64 %0, %1") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rcp, "rcp.approx.ftz.f64 %0, %1;", "d") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rsqrt, "rsqrt.approx.f64 %0, %1;", "d") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sqrt, "sqrt.approx.f32 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rcp, "rcp.approx.f32 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rsqrt, "rsqrt.approx.f32 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sin, 
"sin.approx.f32 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, cos, "cos.approx.f32 %0, %1") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sqrt, "sqrt.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rcp, "rcp.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rsqrt, "rsqrt.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sin, "sin.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, cos, "cos.approx.f32 %0, %1;", "f") + +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, exp2, "ex2.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, log2, "lg2.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, tanh, "tanh.approx.f32 %0, %1;", "f") #endif diff --git a/include/kernel_float/vector.h b/include/kernel_float/vector.h index bf9395b..e52fa02 100644 --- a/include/kernel_float/vector.h +++ b/include/kernel_float/vector.h @@ -54,7 +54,7 @@ struct vector: public S { typename A, typename B, typename... Rest, - typename = enable_if_t> + typename = enable_if_t>> KERNEL_FLOAT_INLINE vector(const A& a, const B& b, const Rest&... rest) : storage_type {T(a), T(b), T(rest)...} {} @@ -63,7 +63,7 @@ struct vector: public S { */ KERNEL_FLOAT_INLINE static constexpr size_t size() { - return E::size; + return extent_size; } /** @@ -276,6 +276,21 @@ struct vector: public S { KERNEL_FLOAT_INLINE void for_each(F fun) const { return kernel_float::for_each(*this, std::move(fun)); } + + /** + * Returns the result of `*this + lhs * rhs`. + * + * The operation is performed using a single `kernel_float::fma` call, which may be faster then perform + * the addition and multiplication separately. 
+ */ + template< + typename L, + typename R, + typename T2 = promote_t, vector_value_type>, + typename E2 = broadcast_extent, vector_extent_type>> + KERNEL_FLOAT_INLINE vector fma(const L& lhs, const R& rhs) const { + return ::kernel_float::fma(lhs, rhs, *this); + } }; /** @@ -308,6 +323,21 @@ template using vec7 = vec; template using vec8 = vec; // clang-format on +#define KERNEL_FLOAT_VECTOR_ALIAS(NAME, T) \ + template \ + using NAME##1 = vec; \ + using NAME##2 = vec; \ + using NAME##3 = vec; \ + using NAME##4 = vec; \ + using NAME##5 = vec; \ + using NAME##6 = vec; \ + using NAME##7 = vec; \ + using NAME##8 = vec; + +KERNEL_FLOAT_VECTOR_ALIAS(int, int) +KERNEL_FLOAT_VECTOR_ALIAS(float, float) +KERNEL_FLOAT_VECTOR_ALIAS(double, double) + /** * Create a vector from a variable number of input values. * diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h index 42a5814..a79da18 100644 --- a/single_include/kernel_float.h +++ b/single_include/kernel_float.h @@ -16,8 +16,8 @@ //================================================================================ // this file has been auto-generated, do not modify its contents! 
-// date: 2024-05-17 10:55:41.948281 -// git hash: 41246ab6db9fcc24639342c439e606ba143ee346 +// date: 2024-09-23 14:12:25.024358 +// git hash: 3a88b56a57cce5e1f3365aa6e8efb76a14f7f865 //================================================================================ #ifndef KERNEL_FLOAT_MACROS_H @@ -85,8 +85,8 @@ #define KERNEL_FLOAT_MAX_ALIGNMENT (32) -#ifndef KERNEL_FLOAT_FAST_MATH -#define KERNEL_FLOAT_FAST_MATH (0) +#if KERNEL_FLOAT_FAST_MATH +#define KERNEL_FLOAT_POLICY ::kernel_float::fast_policy; #endif #endif //KERNEL_FLOAT_MACROS_H @@ -424,7 +424,6 @@ struct alignas(Alignment) aligned_array { }; template - struct aligned_array { KERNEL_FLOAT_INLINE T* data() { @@ -464,10 +463,12 @@ struct extent; template struct extent { - static constexpr size_t value = N; - static constexpr size_t size = N; + static constexpr size_t number_of_elements = N; }; +template +static constexpr size_t extent_size = E::number_of_elements; + namespace detail { // Indicates that elements of type `T` offer less precision than floats, thus operations // on elements of type `T` can be performed by upcasting them to ` float`. 
@@ -594,7 +595,7 @@ KERNEL_FLOAT_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong1, ulonglong2, ulon KERNEL_FLOAT_DEFINE_VECTOR_TYPE(float, float1, float2, float3, float4) KERNEL_FLOAT_DEFINE_VECTOR_TYPE(double, double1, double2, double3, double4) -template> +template> struct vector; template @@ -603,11 +604,16 @@ struct into_vector_impl> { using extent_type = E; KERNEL_FLOAT_INLINE - static vector_storage call(const vector& input) { + static vector_storage> call(const vector& input) { return input.storage(); } }; +template +struct preferred_vector_size { + static constexpr size_t value = 1; +}; + template struct vector_traits; @@ -626,13 +632,13 @@ template using vector_extent_type = typename into_vector_impl::extent_type; template -static constexpr size_t vector_extent = vector_extent_type::value; +static constexpr size_t vector_size = extent_size>; template using into_vector_type = vector, vector_extent_type>; template -using vector_storage_type = vector_storage, vector_extent>; +using vector_storage_type = vector_storage, vector_size>; template using promoted_vector_value_type = promote_t...>; @@ -765,43 +771,66 @@ broadcast_like(const V& input, const R& other) { namespace detail { -template -struct apply_recur_impl; - template struct apply_impl { - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { - apply_recur_impl::call(fun, result, inputs...); + KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... args) { +#pragma unroll + for (size_t i = 0; i < N; i++) { + output[i] = fun(args[i]...); + } } }; -template -struct apply_recur_impl { - static constexpr size_t K = round_up_to_power_of_two(N) / 2; +template +struct apply_fastmath_impl: apply_impl {}; +} // namespace detail - template - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... 
inputs) { - apply_impl::call(fun, result, inputs...); - apply_impl::call(fun, result + K, (inputs + K)...); - } +struct accurate_policy { + template + using type = detail::apply_impl; }; -template<> -struct apply_recur_impl<0> { - template - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) {} +struct fast_policy { + template + using type = detail::apply_fastmath_impl; }; -template<> -struct apply_recur_impl<1> { - template - KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) { - result[0] = fun(inputs[0]...); +#ifdef KERNEL_FLOAT_POLICY +using default_policy = KERNEL_FLOAT_POLICY; +#else +using default_policy = accurate_policy; +#endif + +namespace detail { + +template +struct map_policy_impl { + static constexpr size_t packet_size = preferred_vector_size::value; + static constexpr size_t remainder = N % packet_size; + + KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... args) { + if constexpr (N / packet_size > 0) { +#pragma unroll + for (size_t i = 0; i < N - remainder; i += packet_size) { + Policy::template type::call( + fun, + output + i, + (args + i)...); + } + } + + if constexpr (remainder > 0) { +#pragma unroll + for (size_t i = N - remainder; i < N; i++) { + Policy::template type::call(fun, output + i, (args + i)...); + } + } } }; template -struct apply_fastmath_impl: apply_impl {}; +using map_impl = map_policy_impl; + } // namespace detail template @@ -818,40 +847,13 @@ using map_type = * vec squared = map([](auto x) { return x * x; }, input); // [1.0f, 4.0f, 9.0f, 16.0f] * ``` */ -template +template KERNEL_FLOAT_INLINE map_type map(F fun, const Args&... 
args) { using Output = result_t...>; using E = broadcast_vector_extent_type; - vector_storage result; - - // Use the `apply_fastmath_impl` if KERNEL_FLOAT_FAST_MATH is enabled -#if KERNEL_FLOAT_FAST_MATH - using apply_impl = detail::apply_fastmath_impl...>; -#else - using apply_impl = detail::apply_impl...>; -#endif - - apply_impl::call( - fun, - result.data(), - (detail::broadcast_impl, vector_extent_type, E>::call( - into_vector_storage(args)) - .data())...); - - return result; -} - -/** - * Apply the function `F` to each element from the vector `input` and return the results as a new vector. This - * uses fast-math if available for the given function `F`, otherwise this function behaves like `map`. - */ -template -KERNEL_FLOAT_INLINE map_type fast_map(F fun, const Args&... args) { - using Output = result_t...>; - using E = broadcast_vector_extent_type; - vector_storage result; + vector_storage> result; - detail::apply_fastmath_impl...>::call( + detail::map_policy_impl, Output, vector_value_type...>::call( fun, result.data(), (detail::broadcast_impl, vector_extent_type, E>::call( @@ -1200,10 +1202,10 @@ KERNEL_FLOAT_INLINE vector> cast(const V& input) { } #define KERNEL_FLOAT_DEFINE_UNARY_FUN(NAME) \ - template \ + template \ KERNEL_FLOAT_INLINE vector, vector_extent_type> NAME(const V& input) { \ using F = ops::NAME>; \ - return map(F {}, input); \ + return ::kernel_float::map(F {}, input); \ } #define KERNEL_FLOAT_DEFINE_UNARY(NAME, EXPR) \ @@ -1281,7 +1283,7 @@ KERNEL_FLOAT_DEFINE_UNARY_MATH(ilogb) KERNEL_FLOAT_DEFINE_UNARY_MATH(lgamma) KERNEL_FLOAT_DEFINE_UNARY_MATH(log) KERNEL_FLOAT_DEFINE_UNARY_MATH(log10) -KERNEL_FLOAT_DEFINE_UNARY_MATH(logb) +KERNEL_FLOAT_DEFINE_UNARY_MATH(log2) KERNEL_FLOAT_DEFINE_UNARY_MATH(nearbyint) KERNEL_FLOAT_DEFINE_UNARY_MATH(normcdf) KERNEL_FLOAT_DEFINE_UNARY_MATH(rcbrt) @@ -1308,12 +1310,11 @@ KERNEL_FLOAT_DEFINE_UNARY_STRUCT(rcp, 1.0 / input, 1.0f / input) KERNEL_FLOAT_DEFINE_UNARY_FUN(rcp) -#define 
KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(NAME) \ - template \ - KERNEL_FLOAT_INLINE vector, vector_extent_type> fast_##NAME( \ - const V& input) { \ - using F = ops::NAME>; \ - return fast_map(F {}, input); \ +#define KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(NAME) \ + template \ + KERNEL_FLOAT_INLINE vector, vector_extent_type> fast_##NAME( \ + const V& input) { \ + return ::kernel_float::map(ops::NAME> {}, input); \ } KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(exp) @@ -1324,17 +1325,17 @@ KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(rsqrt) KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(sin) KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(cos) KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(tan) +KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(exp2) +KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(log2) #if KERNEL_FLOAT_IS_DEVICE #define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(T, F, FAST_FUN) \ namespace detail { \ - template \ - struct apply_fastmath_impl, N, T, T> { \ + template<> \ + struct apply_fastmath_impl, 1, T, T> { \ KERNEL_FLOAT_INLINE static void call(ops::F, T* result, const T* inputs) { \ - for (size_t i = 0; i < N; i++) { \ - result[i] = FAST_FUN(inputs[i]); \ - } \ + *result = FAST_FUN(*inputs); \ } \ }; \ } @@ -1342,26 +1343,28 @@ KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(tan) KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(float, exp, __expf) KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(float, log, __logf) -#define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(T, F, INSTR) \ +#define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(T, F, INSTR, REG) \ namespace detail { \ - template \ - struct apply_fastmath_impl, N, T, T> { \ + template<> \ + struct apply_fastmath_impl, 1, T, T> { \ KERNEL_FLOAT_INLINE static void call(ops::F fun, T* result, const T* inputs) { \ - for (size_t i = 0; i < N; i++) { \ - asm(INSTR, : "=f"(result[i]) : "f"(inputs[i])); \ - } \ + asm(INSTR : "=" REG(*result) : REG(*inputs)); \ } \ }; \ } -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rcp, "rcp.approx.ftz.f64 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rsqrt, "rsqrt.approx.f64 %0, 
%1") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rcp, "rcp.approx.ftz.f64 %0, %1;", "d") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rsqrt, "rsqrt.approx.f64 %0, %1;", "d") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sqrt, "sqrt.approx.f32 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rcp, "rcp.approx.f32 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rsqrt, "rsqrt.approx.f32 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sin, "sin.approx.f32 %0, %1") -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, cos, "cos.approx.f32 %0, %1") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sqrt, "sqrt.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rcp, "rcp.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rsqrt, "rsqrt.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sin, "sin.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, cos, "cos.approx.f32 %0, %1;", "f") + +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, exp2, "ex2.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, log2, "lg2.approx.f32 %0, %1;", "f") +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, tanh, "tanh.approx.f32 %0, %1;", "f") #endif @@ -1384,10 +1387,10 @@ namespace detail { template struct convert_impl { KERNEL_FLOAT_INLINE - static vector_storage call(vector_storage input) { + static vector_storage> call(vector_storage> input) { using F = ops::cast; - vector_storage intermediate; - detail::apply_impl::call(F {}, intermediate.data(), input.data()); + vector_storage> intermediate; + detail::map_impl, T2, T>::call(F {}, intermediate.data(), input.data()); return detail::broadcast_impl::call(intermediate); } }; @@ -1396,7 +1399,7 @@ struct convert_impl { template struct convert_impl { KERNEL_FLOAT_INLINE - static vector_storage call(vector_storage input) { + static vector_storage> call(vector_storage> input) { return input; } }; @@ -1405,7 +1408,7 @@ 
struct convert_impl { template struct convert_impl { KERNEL_FLOAT_INLINE - static vector_storage call(vector_storage input) { + static vector_storage> call(vector_storage> input) { return detail::broadcast_impl::call(input); } }; @@ -1414,11 +1417,11 @@ struct convert_impl { template struct convert_impl { KERNEL_FLOAT_INLINE - static vector_storage call(vector_storage input) { + static vector_storage> call(vector_storage> input) { using F = ops::cast; - vector_storage result; - detail::apply_impl::call(F {}, result.data(), input.data()); + vector_storage> result; + detail::map_impl, T2, T>::call(F {}, result.data(), input.data()); return result; } }; @@ -1637,16 +1640,9 @@ KERNEL_FLOAT_INLINE zip_common_type zip_common(F fun, const L& left, co using O = result_t; using E = broadcast_vector_extent_type; - vector_storage result; + vector_storage> result; -// Use the `apply_fastmath_impl` if KERNEL_FLOAT_FAST_MATH is enabled -#if KERNEL_FLOAT_FAST_MATH - using apply_impl = detail::apply_fastmath_impl; -#else - using apply_impl = detail::apply_impl; -#endif - - apply_impl::call( + detail::map_impl, O, T, T>::call( fun, result.data(), detail::convert_impl, vector_extent_type, T, E>::call( @@ -1890,20 +1886,18 @@ struct apply_fastmath_impl, N, T, T, T> { call(ops::divide fun, T* result, const T* lhs, const T* rhs) { T rhs_rcp[N]; - apply_fastmath_impl, N, T, T, T>::call({}, rhs_rcp, rhs); + // Fast way to perform division is to multiply by the reciprocal + apply_fastmath_impl, N, T, T>::call({}, rhs_rcp, rhs); apply_fastmath_impl, N, T, T, T>::call({}, result, lhs, rhs_rcp); } }; #if KERNEL_FLOAT_IS_DEVICE -template -struct apply_fastmath_impl, N, float, float, float> { +template<> +struct apply_fastmath_impl, 1, float, float, float> { KERNEL_FLOAT_INLINE static void call(ops::divide fun, float* result, const float* lhs, const float* rhs) { -#pragma unroll - for (size_t i = 0; i < N; i++) { - result[i] = __fdividef(lhs[i], rhs[i]); - } + *result = __fdividef(*lhs, *rhs); 
} }; #endif @@ -1913,9 +1907,9 @@ template> KERNEL_FLOAT_INLINE zip_common_type, T, T> fast_divide(const L& left, const R& right) { using E = broadcast_vector_extent_type; - vector_storage result; + vector_storage> result; - detail::apply_fastmath_impl, E::value, T, T, T>::call( + detail::map_policy_impl, extent_size, T, T, T>::call( ops::divide {}, result.data(), detail::convert_impl, vector_extent_type, T, E>::call( @@ -2133,7 +2127,7 @@ KERNEL_FLOAT_INLINE void for_each(V&& input, F fun) { auto storage = into_vector_storage(input); #pragma unroll - for (size_t i = 0; i < vector_extent; i++) { + for (size_t i = 0; i < vector_size; i++) { fun(storage.data()[i]); } } @@ -2197,7 +2191,7 @@ KERNEL_FLOAT_INLINE vector> range() { */ template KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { - return range, vector_extent>(); + return range, vector_size>(); } /** @@ -2222,11 +2216,11 @@ KERNEL_FLOAT_INLINE into_vector_type range_like(const V& = {}) { */ template KERNEL_FLOAT_INLINE vector> each_index(const V& = {}) { - return range>(); + return range>(); } namespace detail { -template, size_t N = vector_extent> +template, size_t N = vector_size> struct flatten_impl { using value_type = typename flatten_impl::value_type; static constexpr size_t size = N * flatten_impl::size; @@ -2288,7 +2282,7 @@ KERNEL_FLOAT_INLINE flatten_type flatten(const V& input) { namespace detail { template> struct concat_base_impl { - static constexpr size_t size = vector_extent; + static constexpr size_t size = vector_size; KERNEL_FLOAT_INLINE static void call(U* output, const V& input) { vector_storage storage = into_vector_storage(input); @@ -2405,7 +2399,7 @@ using select_type = vector, extent>>; template KERNEL_FLOAT_INLINE select_type select(const V& input, const Is&... 
indices) { using T = vector_value_type; - static constexpr size_t N = vector_extent; + static constexpr size_t N = vector_size; static constexpr size_t M = concat_size; vector_storage index_set; @@ -2467,7 +2461,7 @@ struct copy_impl> { */ template> KERNEL_FLOAT_INLINE vector read(const T* ptr, const I& indices, const M& mask = true) { - return detail::copy_impl::load( + return detail::copy_impl>::load( ptr, convert_storage(indices, E()).data(), convert_storage(mask, E()).data()); @@ -2494,7 +2488,7 @@ template< typename M = bool, typename E = broadcast_vector_extent_type> KERNEL_FLOAT_INLINE void write(T* ptr, const I& indices, const V& values, const M& mask = true) { - return detail::copy_impl::store( + return detail::copy_impl>::store( ptr, convert_storage(values, E()).data(), convert_storage(indices, E()).data(), @@ -2531,7 +2525,7 @@ KERNEL_FLOAT_INLINE vector> read(const T* ptr) { */ template KERNEL_FLOAT_INLINE void write(T* ptr, const V& values) { - static constexpr size_t N = vector_extent; + static constexpr size_t N = vector_size; write(ptr, range(), values); } @@ -2706,7 +2700,7 @@ KERNEL_FLOAT_INLINE vector> read_aligned(const T* ptr) { */ template KERNEL_FLOAT_INLINE void write_aligned(T* ptr, const V& values) { - static constexpr size_t N = vector_extent; + static constexpr size_t N = vector_size; static constexpr size_t alignment = detail::gcd(Align * sizeof(T), KERNEL_FLOAT_MAX_ALIGNMENT); return detail::copy_aligned_impl::store( @@ -3034,11 +3028,173 @@ vec_ptr(const T*) -> vec_ptr; } // namespace kernel_float #endif //KERNEL_FLOAT_MEMORY_H +#ifndef KERNEL_FLOAT_TRIOPS_H +#define KERNEL_FLOAT_TRIOPS_H + + + + +namespace kernel_float { + +namespace ops { +template +struct conditional { + KERNEL_FLOAT_INLINE T operator()(bool cond, T true_value, T false_value) { + if (cond) { + return true_value; + } else { + return false_value; + } + } +}; +} // namespace ops + +/** + * Return elements chosen from `true_values` and `false_values` depending on 
`cond`. + * + * This function broadcasts all arguments to the same size and then promotes the values of `true_values` and + * `false_values` into the same type. Next, it casts the values of `cond` to booleans and returns a vector where + * the values are taken from `true_values` where the condition is true and `false_values` otherwise. + * + * @param cond The condition used for selection. + * @param true_values The vector of values to choose from when the condition is true. + * @param false_values The vector of values to choose from when the condition is false. + * @return A vector containing selected elements as per the condition. + */ +template< + typename C, + typename L, + typename R, + typename T = promoted_vector_value_type, + typename E = broadcast_vector_extent_type> +KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values, const R& false_values) { + using F = ops::conditional; + vector_storage> result; + + detail::map_impl, T, bool, T, T>::call( + F {}, + result.data(), + detail::convert_impl, vector_extent_type, bool, E>::call( + into_vector_storage(cond)) + .data(), + detail::convert_impl, vector_extent_type, T, E>::call( + into_vector_storage(true_values)) + .data(), + detail::convert_impl, vector_extent_type, T, E>::call( + into_vector_storage(false_values)) + .data()); + + return result; +} + +/** + * Selects elements from `true_values` depending on `cond`. + * + * This function returns a vector where the values are taken from `true_values` where `cond` is `true` and `0` where + * `cond is `false`. + * + * @param cond The condition used for selection. + * @param true_values The vector of values to choose from when the condition is true. + * @return A vector containing selected elements as per the condition. 
+ */ +template< + typename C, + typename L, + typename T = vector_value_type, + typename E = broadcast_vector_extent_type> +KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values) { + vector> false_values = T {}; + return where(cond, true_values, false_values); +} + +/** + * Returns a vector having the value `T(1)` where `cond` is `true` and `T(0)` where `cond` is `false`. + * + * @param cond The condition used for selection. + * @return A vector containing elements as per the condition. + */ +template> +KERNEL_FLOAT_INLINE vector where(const C& cond) { + return cast(cast(cond)); +} + +namespace ops { +template +struct fma { + KERNEL_FLOAT_INLINE T operator()(T a, T b, T c) { + return ops::add {}(ops::multiply {}(a, b), c); + } +}; +} // namespace ops + +namespace detail { +template +struct apply_impl, N, T, T, T, T> { + KERNEL_FLOAT_INLINE + static void call(ops::fma, T* output, const T* a, const T* b, const T* c) { + T temp[N]; + apply_impl, N, T, T, T>::call({}, temp, a, b); + apply_impl, N, T, T, T>::call({}, output, temp, c); + } +}; +} // namespace detail + +#if KERNEL_FLOAT_IS_DEVICE +namespace ops { +template<> +struct fma { + KERNEL_FLOAT_INLINE float operator()(float a, float b, float c) { + return __fmaf_rn(a, b, c); + } +}; + +template<> +struct fma { + KERNEL_FLOAT_INLINE double operator()(double a, double b, double c) { + return __fma_rn(a, b, c); + } +}; +} // namespace ops +#endif + +/** + * Computes the result of `a * b + c`. This is done in a single operation if possible for the given vector type. 
+ */ +template< + typename A, + typename B, + typename C, + typename T = promoted_vector_value_type, + typename E = broadcast_vector_extent_type> +KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) { + using F = ops::fma; + vector_storage> result; + + detail::map_impl, T, T, T, T>::call( + F {}, + result.data(), + detail::convert_impl, vector_extent_type, T, E>::call( + into_vector_storage(a)) + .data(), + detail::convert_impl, vector_extent_type, T, E>::call( + into_vector_storage(b)) + .data(), + detail::convert_impl, vector_extent_type, T, E>::call( + into_vector_storage(c)) + .data()); + + return result; +} + +} // namespace kernel_float + +#endif //KERNEL_FLOAT_TRIOPS_H #ifndef KERNEL_FLOAT_REDUCE_H #define KERNEL_FLOAT_REDUCE_H + namespace kernel_float { namespace detail { @@ -3059,7 +3215,7 @@ struct reduce_recur_impl { template KERNEL_FLOAT_INLINE static T call(F fun, const T* input) { vector_storage temp; - apply_impl::call(fun, temp.data(), input, input + K); + map_impl::call(fun, temp.data(), input, input + K); if constexpr (N < 2 * K) { #pragma unroll @@ -3109,7 +3265,7 @@ struct reduce_recur_impl<2> { */ template KERNEL_FLOAT_INLINE vector_value_type reduce(F fun, const V& input) { - return detail::reduce_impl, vector_value_type>::call( + return detail::reduce_impl, vector_value_type>::call( fun, into_vector_storage(input).data()); } @@ -3213,14 +3369,38 @@ template struct dot_impl { KERNEL_FLOAT_INLINE static T call(const T* left, const T* right) { - vector_storage intermediate; - detail::apply_impl, N, T, T, T>::call( - ops::multiply(), - intermediate.data(), - left, - right); + static constexpr size_t K = preferred_vector_size::value; + T result = {}; + + if constexpr (N / K > 0) { + T accum[K] = {T {}}; + apply_impl, K, T, T, T>::call({}, accum, left, right); + +#pragma unroll + for (size_t i = 1; i < N / K; i++) { + apply_impl, K, T, T, T, T>::call( + ops::fma {}, + accum, + left + i * K, + right + i * K, + accum); + } + + result = 
reduce_impl, K, T>::call({}, accum); + } - return detail::reduce_impl, N, T>::call(ops::add(), intermediate.data()); + if constexpr (N % K > 0) { + for (size_t i = N - N % K; i < N; i++) { + apply_impl, 1, T, T, T, T>::call( + {}, + &result, + left + i, + right + i, + &result); + } + } + + return result; } }; } // namespace detail @@ -3239,7 +3419,7 @@ struct dot_impl { template> KERNEL_FLOAT_INLINE T dot(const L& left, const R& right) { using E = broadcast_vector_extent_type; - return detail::dot_impl::call( + return detail::dot_impl>::call( convert_storage(left, E {}).data(), convert_storage(right, E {}).data()); } @@ -3309,158 +3489,11 @@ struct magnitude_impl { */ template> KERNEL_FLOAT_INLINE T mag(const V& input) { - return detail::magnitude_impl>::call(into_vector_storage(input).data()); + return detail::magnitude_impl>::call(into_vector_storage(input).data()); } } // namespace kernel_float #endif //KERNEL_FLOAT_REDUCE_H -#ifndef KERNEL_FLOAT_TRIOPS_H -#define KERNEL_FLOAT_TRIOPS_H - - - - -namespace kernel_float { - -namespace ops { -template -struct conditional { - KERNEL_FLOAT_INLINE T operator()(bool cond, T true_value, T false_value) { - if (cond) { - return true_value; - } else { - return false_value; - } - } -}; -} // namespace ops - -/** - * Return elements chosen from `true_values` and `false_values` depending on `cond`. - * - * This function broadcasts all arguments to the same size and then promotes the values of `true_values` and - * `false_values` into the same type. Next, it casts the values of `cond` to booleans and returns a vector where - * the values are taken from `true_values` where the condition is true and `false_values` otherwise. - * - * @param cond The condition used for selection. - * @param true_values The vector of values to choose from when the condition is true. - * @param false_values The vector of values to choose from when the condition is false. - * @return A vector containing selected elements as per the condition. 
- */ -template< - typename C, - typename L, - typename R, - typename T = promoted_vector_value_type, - typename E = broadcast_vector_extent_type> -KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values, const R& false_values) { - using F = ops::conditional; - vector_storage result; - - detail::apply_impl::call( - F {}, - result.data(), - detail::convert_impl, vector_extent_type, bool, E>::call( - into_vector_storage(cond)) - .data(), - detail::convert_impl, vector_extent_type, T, E>::call( - into_vector_storage(true_values)) - .data(), - detail::convert_impl, vector_extent_type, T, E>::call( - into_vector_storage(false_values)) - .data()); - - return result; -} - -/** - * Selects elements from `true_values` depending on `cond`. - * - * This function returns a vector where the values are taken from `true_values` where `cond` is `true` and `0` where - * `cond is `false`. - * - * @param cond The condition used for selection. - * @param true_values The vector of values to choose from when the condition is true. - * @return A vector containing selected elements as per the condition. - */ -template< - typename C, - typename L, - typename T = vector_value_type, - typename E = broadcast_vector_extent_type> -KERNEL_FLOAT_INLINE vector where(const C& cond, const L& true_values) { - vector> false_values = T {}; - return where(cond, true_values, false_values); -} - -/** - * Returns a vector having the value `T(1)` where `cond` is `true` and `T(0)` where `cond` is `false`. - * - * @param cond The condition used for selection. - * @return A vector containing elements as per the condition. 
- */ -template> -KERNEL_FLOAT_INLINE vector where(const C& cond) { - return cast(cast(cond)); -} - -namespace ops { -template -struct fma { - KERNEL_FLOAT_INLINE T operator()(T a, T b, T c) { - return a * b + c; - } -}; - -#if KERNEL_FLOAT_IS_DEVICE -template<> -struct fma { - KERNEL_FLOAT_INLINE float operator()(float a, float b, float c) { - return __fmaf_rn(a, b, c); - } -}; - -template<> -struct fma { - KERNEL_FLOAT_INLINE double operator()(double a, double b, double c) { - return __fma_rn(a, b, c); - } -}; -#endif -} // namespace ops - -/** - * Computes the result of `a * b + c`. This is done in a single operation if possible. - */ -template< - typename A, - typename B, - typename C, - typename T = promoted_vector_value_type, - typename E = broadcast_vector_extent_type> -KERNEL_FLOAT_INLINE vector fma(const A& a, const B& b, const C& c) { - using F = ops::fma; - vector_storage result; - - detail::apply_impl::call( - F {}, - result.data(), - detail::convert_impl, vector_extent_type, T, E>::call( - into_vector_storage(a)) - .data(), - detail::convert_impl, vector_extent_type, T, E>::call( - into_vector_storage(b)) - .data(), - detail::convert_impl, vector_extent_type, T, E>::call( - into_vector_storage(c)) - .data()); - - return result; -} - -} // namespace kernel_float - -#endif //KERNEL_FLOAT_TRIOPS_H #ifndef KERNEL_FLOAT_VECTOR_H #define KERNEL_FLOAT_VECTOR_H @@ -3517,7 +3550,7 @@ struct vector: public S { typename A, typename B, typename... Rest, - typename = enable_if_t> + typename = enable_if_t>> KERNEL_FLOAT_INLINE vector(const A& a, const B& b, const Rest&... 
rest) : storage_type {T(a), T(b), T(rest)...} {} @@ -3526,7 +3559,7 @@ struct vector: public S { */ KERNEL_FLOAT_INLINE static constexpr size_t size() { - return E::size; + return extent_size; } /** @@ -3739,6 +3772,21 @@ struct vector: public S { KERNEL_FLOAT_INLINE void for_each(F fun) const { return kernel_float::for_each(*this, std::move(fun)); } + + /** + * Returns the result of `*this + lhs * rhs`. + * + * The operation is performed using a single `kernel_float::fma` call, which may be faster then perform + * the addition and multiplication separately. + */ + template< + typename L, + typename R, + typename T2 = promote_t, vector_value_type>, + typename E2 = broadcast_extent, vector_extent_type>> + KERNEL_FLOAT_INLINE vector fma(const L& lhs, const R& rhs) const { + return ::kernel_float::fma(lhs, rhs, *this); + } }; /** @@ -3771,6 +3819,21 @@ template using vec7 = vec; template using vec8 = vec; // clang-format on +#define KERNEL_FLOAT_VECTOR_ALIAS(NAME, T) \ + template \ + using NAME##1 = vec; \ + using NAME##2 = vec; \ + using NAME##3 = vec; \ + using NAME##4 = vec; \ + using NAME##5 = vec; \ + using NAME##6 = vec; \ + using NAME##7 = vec; \ + using NAME##8 = vec; + +KERNEL_FLOAT_VECTOR_ALIAS(int, int) +KERNEL_FLOAT_VECTOR_ALIAS(float, float) +KERNEL_FLOAT_VECTOR_ALIAS(double, double) + /** * Create a vector from a variable number of input values. * @@ -3816,6 +3879,12 @@ vec(Args&&... 
args) -> vec, sizeof...(Args)>; namespace kernel_float { + +template<> +struct preferred_vector_size<__half> { + static constexpr size_t value = 2; +}; + KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__half) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __half) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __half) @@ -3972,58 +4041,10 @@ KERNEL_FLOAT_FP16_CAST(unsigned long, __ull2half_rn(input), (unsigned long)(__ha KERNEL_FLOAT_FP16_CAST(unsigned long long, __ull2half_rn(input), __half2ull_rz(input)); using half = __half; +KERNEL_FLOAT_VECTOR_ALIAS(half, __half) //KERNEL_FLOAT_TYPE_ALIAS(float16x, __half) //KERNEL_FLOAT_TYPE_ALIAS(f16x, __half) -#if KERNEL_FLOAT_IS_DEVICE -namespace detail { -template<> -struct dot_impl<__half, 0> { - KERNEL_FLOAT_INLINE - static __half call(const __half* left, const __half* right) { - return __half(0); - } -}; - -template<> -struct dot_impl<__half, 1> { - KERNEL_FLOAT_INLINE - static __half call(const __half* left, const __half* right) { - return __hmul(left[0], right[0]); - } -}; - -template -struct dot_impl<__half, N> { - static_assert(N >= 2, "internal error"); - - KERNEL_FLOAT_INLINE - static __half call(const __half* left, const __half* right) { - __half2 first_a = {left[0], left[1]}; - __half2 first_b = {right[0], right[1]}; - __half2 accum = __hmul2(first_a, first_b); - -#pragma unroll - for (size_t i = 2; i + 2 <= N; i += 2) { - __half2 a = {left[i], left[i + 1]}; - __half2 b = {right[i], right[i + 1]}; - accum = __hfma2(a, b, accum); - } - - __half result = __hadd(accum.x, accum.y); - - if (N % 2 != 0) { - __half a = left[N - 1]; - __half b = right[N - 1]; - result = __hfma(a, b, result); - } - - return result; - } -}; -} // namespace detail -#endif - } // namespace kernel_float #endif @@ -4042,6 +4063,12 @@ struct dot_impl<__half, N> { namespace kernel_float { + +template<> +struct preferred_vector_size<__nv_bfloat16> { + static constexpr size_t value = 2; +}; + KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__nv_bfloat16) 
KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __nv_bfloat16) KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __nv_bfloat16) @@ -4248,58 +4275,10 @@ KERNEL_FLOAT_BF16_CAST( #endif using bfloat16 = __nv_bfloat16; +KERNEL_FLOAT_VECTOR_ALIAS(bfloat16x, __nv_bfloat16) //KERNEL_FLOAT_TYPE_ALIAS(float16x, __nv_bfloat16) //KERNEL_FLOAT_TYPE_ALIAS(f16x, __nv_bfloat16) -#if KERNEL_FLOAT_CUDA_ARCH >= 800 -namespace detail { -template<> -struct dot_impl<__nv_bfloat16, 0> { - KERNEL_FLOAT_INLINE - static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) { - return __nv_bfloat16(0); - } -}; - -template<> -struct dot_impl<__nv_bfloat16, 1> { - KERNEL_FLOAT_INLINE - static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) { - return __hmul(left[0], right[0]); - } -}; - -template -struct dot_impl<__nv_bfloat16, N> { - static_assert(N >= 2, "internal error"); - - KERNEL_FLOAT_INLINE - static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) { - __nv_bfloat162 first_a = {left[0], left[1]}; - __nv_bfloat162 first_b = {right[0], right[1]}; - __nv_bfloat162 accum = __hmul2(first_a, first_b); - -#pragma unroll - for (size_t i = 2; i + 1 < N; i += 2) { - __nv_bfloat162 a = {left[i], left[i + 1]}; - __nv_bfloat162 b = {right[i], right[i + 1]}; - accum = __hfma2(a, b, accum); - } - - __nv_bfloat16 result = __hadd(accum.x, accum.y); - - if (N % 2 != 0) { - __nv_bfloat16 a = left[N - 1]; - __nv_bfloat16 b = right[N - 1]; - result = __hfma(a, b, result); - } - - return result; - } -}; -} // namespace detail -#endif - } // namespace kernel_float #if KERNEL_FLOAT_FP16_AVAILABLE