[CI/Build] Enforce style for C++ and CUDA code with clang-format #4722

Merged (20 commits) on May 22, 2024
Changes from 1 commit
23 changes: 23 additions & 0 deletions .clang-format
@@ -0,0 +1,23 @@
BasedOnStyle: Google
UseTab: Never
IndentWidth: 2
ColumnLimit: 100

# Force pointers to the type for C++.
DerivePointerAlignment: false
PointerAlignment: Left

AlignConsecutiveAssignments: true
AlignConsecutiveDeclarations: true
AlignOperands: true
AlwaysBreakTemplateDeclarations: Yes

IncludeCategories:
  - Regex: '^<'
    Priority: 4
  - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
    Priority: 3
  - Regex: '^"(qoda|\.\.)/'
    Priority: 2
  - Regex: '.*'
    Priority: 1
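For orientation, below is a small illustrative C++ fragment, not taken from this PR, showing roughly how code looks under the configuration above: the template header breaks onto its own line (AlwaysBreakTemplateDeclarations), the asterisk binds to the type (DerivePointerAlignment: false with PointerAlignment: Left), and consecutive declarations and assignments are column-aligned. The function scale_elements and everything in it are invented for this sketch. Include ordering is governed by IncludeCategories: clang-format assigns each include the Priority of the first regex it matches and orders groups by ascending Priority, which is why quoted project headers such as "cuda_compat.h" move ahead of angle-bracket includes in the activation_kernels.cu diff below.

// Hypothetical fragment (not from this PR), laid out as the .clang-format above would format it.
#include <cstddef>
#include <vector>

// AlwaysBreakTemplateDeclarations: Yes -> the template header always gets its own line.
template <typename T>
void scale_elements(T* data, std::size_t count, T factor) {  // PointerAlignment: Left
  // AlignConsecutiveAssignments / AlignConsecutiveDeclarations line these up in columns.
  std::size_t index  = 0;
  std::size_t stride = 1;
  for (; index < count; index += stride) {
    data[index] = data[index] * factor;
  }
}

int main() {
  std::vector<float> values(8, 2.0f);
  scale_elements(values.data(), values.size(), 0.5f);
  return 0;
}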
160 changes: 71 additions & 89 deletions csrc/activation_kernels.cu
@@ -1,161 +1,143 @@
#include "cuda_compat.h"
#include "dispatch_utils.h"

#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>

#include <cmath>

#include "cuda_compat.h"
#include "dispatch_utils.h"
#include <torch/extension.h>

namespace vllm {

 // Activation and gating kernel template.
-template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
-__global__ void act_and_mul_kernel(
-  scalar_t* __restrict__ out,          // [..., d]
-  const scalar_t* __restrict__ input,  // [..., 2, d]
-  const int d) {
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
+__global__ void act_and_mul_kernel(scalar_t* __restrict__ out,          // [..., d]
+                                   const scalar_t* __restrict__ input,  // [..., 2, d]
+                                   const int d) {
   const int64_t token_idx = blockIdx.x;
   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
-    const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
+    const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
+    const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
     out[token_idx * d + idx] = ACT_FN(x) * y;
   }
 }

-template<typename T>
+template <typename T>
 __device__ __forceinline__ T silu_kernel(const T& x) {
   // x * sigmoid(x)
-  return (T) (((float) x) / (1.0f + expf((float) -x)));
+  return (T)(((float)x) / (1.0f + expf((float)-x)));
 }

-template<typename T>
+template <typename T>
 __device__ __forceinline__ T gelu_kernel(const T& x) {
   // Equivalent to PyTorch GELU with 'none' approximation.
   // Refer to:
   // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
-  const float f = (float) x;
+  const float f = (float)x;
   constexpr float ALPHA = M_SQRT1_2;
-  return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA)));
+  return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA)));
 }

-template<typename T>
+template <typename T>
 __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
   // Equivalent to PyTorch GELU with 'tanh' approximation.
   // Refer to:
   // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
-  const float f = (float) x;
-  constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
-  constexpr float KAPPA = 0.044715;
-  float x_cube = f * f * f;
-  float inner = BETA * (f + KAPPA * x_cube);
-  return (T) (0.5f * f * (1.0f + ::tanhf(inner)));
+  const float f = (float)x;
+  constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
+  constexpr float KAPPA = 0.044715;
+  float x_cube = f * f * f;
+  float inner = BETA * (f + KAPPA * x_cube);
+  return (T)(0.5f * f * (1.0f + ::tanhf(inner)));
 }

-} // namespace vllm
+}  // namespace vllm

 // Launch activation and gating kernel.
-#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
-  int d = input.size(-1) / 2; \
-  int64_t num_tokens = input.numel() / input.size(-1); \
-  dim3 grid(num_tokens); \
-  dim3 block(std::min(d, 1024)); \
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
-  VLLM_DISPATCH_FLOATING_TYPES( \
-    input.scalar_type(), \
-    "act_and_mul_kernel", \
-    [&] { \
-      vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>( \
-        out.data_ptr<scalar_t>(), \
-        input.data_ptr<scalar_t>(), \
-        d); \
-    });
-
-void silu_and_mul(
-  torch::Tensor& out,    // [..., d]
-  torch::Tensor& input)  // [..., 2 * d]
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
+  int d = input.size(-1) / 2; \
+  int64_t num_tokens = input.numel() / input.size(-1); \
+  dim3 grid(num_tokens); \
+  dim3 block(std::min(d, 1024)); \
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
+  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "act_and_mul_kernel", [&] { \
+    vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>> \
+        <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d); \
+  });
+
+void silu_and_mul(torch::Tensor& out,    // [..., d]
+                  torch::Tensor& input)  // [..., 2 * d]
 {
   LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
 }

-void gelu_and_mul(
-  torch::Tensor& out,    // [..., d]
-  torch::Tensor& input)  // [..., 2 * d]
+void gelu_and_mul(torch::Tensor& out,    // [..., d]
+                  torch::Tensor& input)  // [..., 2 * d]
 {
   LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
 }

-void gelu_tanh_and_mul(
-  torch::Tensor& out,    // [..., d]
-  torch::Tensor& input)  // [..., 2 * d]
+void gelu_tanh_and_mul(torch::Tensor& out,    // [..., d]
+                       torch::Tensor& input)  // [..., 2 * d]
 {
   LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
 }

 namespace vllm {

 // Element-wise activation kernel template.
-template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
-__global__ void activation_kernel(
-  scalar_t* __restrict__ out,          // [..., d]
-  const scalar_t* __restrict__ input,  // [..., d]
-  const int d) {
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
+__global__ void activation_kernel(scalar_t* __restrict__ out,          // [..., d]
+                                  const scalar_t* __restrict__ input,  // [..., d]
+                                  const int d) {
   const int64_t token_idx = blockIdx.x;
   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
+    const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
     out[token_idx * d + idx] = ACT_FN(x);
   }
 }

-} // namespace vllm
+}  // namespace vllm

 // Launch element-wise activation kernel.
-#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
-  int d = input.size(-1); \
-  int64_t num_tokens = input.numel() / d; \
-  dim3 grid(num_tokens); \
-  dim3 block(std::min(d, 1024)); \
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
-  VLLM_DISPATCH_FLOATING_TYPES( \
-    input.scalar_type(), \
-    "activation_kernel", \
-    [&] { \
-      vllm::activation_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>( \
-        out.data_ptr<scalar_t>(), \
-        input.data_ptr<scalar_t>(), \
-        d); \
-    });
+#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
+  int d = input.size(-1); \
+  int64_t num_tokens = input.numel() / d; \
+  dim3 grid(num_tokens); \
+  dim3 block(std::min(d, 1024)); \
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
+  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \
+    vllm::activation_kernel<scalar_t, KERNEL<scalar_t>> \
+        <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d); \
+  });

 namespace vllm {

-template<typename T>
+template <typename T>
 __device__ __forceinline__ T gelu_new_kernel(const T& x) {
-  const float x3 = (float) (x * x * x);
-  const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3))));
-  return ((T) 0.5) * x * (((T) 1.0) + t);
+  const float x3 = (float)(x * x * x);
+  const T t = (T)tanhf((T)(0.79788456f * (float)(x + (T)(0.044715f * x3))));
+  return ((T)0.5) * x * (((T)1.0) + t);
 }

-template<typename T>
+template <typename T>
 __device__ __forceinline__ T gelu_fast_kernel(const T& x) {
-  const float f = (float) x;
-  const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x));
-  return ((T) 0.5) * x * (((T) 1.0) + t);
+  const float f = (float)x;
+  const T t = (T)tanhf(((T)(f * 0.79788456f)) * (((T)1.0) + (T)(0.044715f * f) * x));
+  return ((T)0.5) * x * (((T)1.0) + t);
 }

-} // namespace vllm
+}  // namespace vllm

-void gelu_new(
-  torch::Tensor& out,    // [..., d]
-  torch::Tensor& input)  // [..., d]
+void gelu_new(torch::Tensor& out,    // [..., d]
+              torch::Tensor& input)  // [..., d]
 {
   LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
 }

-void gelu_fast(
-  torch::Tensor& out,    // [..., d]
-  torch::Tensor& input)  // [..., d]
+void gelu_fast(torch::Tensor& out,    // [..., d]
+               torch::Tensor& input)  // [..., d]
 {
   LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
 }
2 changes: 1 addition & 1 deletion csrc/attention/attention_dtypes.h
@@ -1,7 +1,7 @@
 #pragma once

 #include "attention_generic.cuh"
+#include "dtype_bfloat16.cuh"
 #include "dtype_float16.cuh"
 #include "dtype_float32.cuh"
-#include "dtype_bfloat16.cuh"
 #include "dtype_fp8.cuh"