From 40e37287a15cee340af77ee95b21f877c087f15a Mon Sep 17 00:00:00 2001 From: Liqun Fu Date: Fri, 15 Nov 2024 16:39:32 -0800 Subject: [PATCH 1/8] refactor and handle beta properly Signed-off-by: Liqun Fu --- .../contrib_ops/cpu/skip_layer_norm.cc | 192 ++++++++---------- 1 file changed, 85 insertions(+), 107 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 3e70f848675cb..3b88e9068a0cb 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -96,79 +96,6 @@ void ComputeJob( } } -void ComputeJob( - const MLFloat16* input_data, - const MLFloat16* skip_data, - const float* prepacked_skip_fp32_data, - const float* gamma_float_ptr, - const float* beta_float_ptr, - const float* bias_float_ptr, - float* output_float_ptr, - ptrdiff_t task_idx, - int hidden_size, - int64_t skip_size, - float epsilon, - bool simplified, - MLFloat16* output_data, - MLFloat16* skip_input_bias_add_output_data, - AllocatorPtr alloc) { - auto offset = task_idx * hidden_size; - const MLFloat16* p_input = input_data + offset; - MLFloat16* p_output = output_data + offset; - MLFloat16* p_skip_input_bias_add_output = skip_input_bias_add_output_data == nullptr ? nullptr : skip_input_bias_add_output_data + offset; - - float mean(0.0f); - float mean_square(0.0f); - const size_t num_elems = static_cast(hidden_size); - - IAllocatorUniquePtr input_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); - MlasConvertHalfToFloatBuffer(p_input, input_float_uptr.get(), num_elems); - - IAllocatorUniquePtr skip_float_uptr = nullptr; - if (prepacked_skip_fp32_data == nullptr && skip_data) { - const MLFloat16* p_skip = skip_data + (offset % skip_size); - skip_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); - MlasConvertHalfToFloatBuffer(p_skip, skip_float_uptr.get(), num_elems); - } - - const float* input_float_ptr = input_float_uptr.get(); - const float* skip_float_ptr = prepacked_skip_fp32_data ? prepacked_skip_fp32_data : skip_float_uptr.get(); - for (size_t h = 0; h < num_elems; h++) { - float val = input_float_ptr[h] + skip_float_ptr[h]; - - if (bias_float_ptr) { - val += bias_float_ptr[h]; - } - - output_float_ptr[h] = val; - mean += val; - mean_square += val * val; - } - - if (nullptr != p_skip_input_bias_add_output) { - MlasConvertFloatToHalfBuffer(output_float_ptr, p_skip_input_bias_add_output, num_elems); - } - - mean = mean / hidden_size; - if (simplified) { - mean_square = sqrt(mean_square / hidden_size + epsilon); - } else { - mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon); - } - - for (size_t h = 0; h < num_elems; h++) { - if (simplified) { - output_float_ptr[h] = output_float_ptr[h] / mean_square * gamma_float_ptr[h]; - } else if (nullptr == beta_float_ptr) { - output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * gamma_float_ptr[h]; - } else { - output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * gamma_float_ptr[h] + beta_float_ptr[h]; - } - } - - MlasConvertFloatToHalfBuffer(output_float_ptr, p_output, num_elems); -} - void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, IAllocatorUniquePtr& dest, bool& is_packed) { if (tensor.GetElementType() == utils::ToTensorProtoElementType()) { auto tensor_data_ptr = tensor.Data(); @@ -200,8 +127,8 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { const Tensor* input = p_ctx->Input(0); const Tensor* skip = prepacked_skip_fp32_data_ ? nullptr : p_ctx->Input(1); const Tensor* gamma = prepacked_gamma_fp32_data_ ? nullptr : p_ctx->Input(2); - const Tensor* beta = prepacked_beta_fp32_data_ ? nullptr : p_ctx->Input(3); - const Tensor* bias = prepacked_bias_fp32_data_ ? nullptr : p_ctx->Input(4); + const Tensor* beta = simplified ? nullptr : (prepacked_beta_fp32_data_ ? nullptr : p_ctx->Input(3)); + const Tensor* bias = prepacked_bias_fp32_data_ ? nullptr : p_ctx->Input(simplified ? 3 : 4); Tensor* output = p_ctx->Output(0, input->Shape()); // For inferencing, we support one more optional output which is the sum of the input and skip tensors Tensor* skip_input_bias_add_output = p_ctx->Output(3, input->Shape()); @@ -232,56 +159,96 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { // For inferencing, we support one more optional output which is the sum of the input and skip tensors T* skip_input_bias_add_output_data = skip_input_bias_add_output == nullptr ? nullptr : skip_input_bias_add_output->MutableData(); - const int64_t skip_size = skip ? skip->Shape().Size() : prepacked_skip_fp32_size_; - AllocatorPtr alloc; - ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc)); - - IAllocatorUniquePtr output_fp32; - IAllocatorUniquePtr gamma_fp32; - IAllocatorUniquePtr beta_fp32; - IAllocatorUniquePtr bias_fp32; - if constexpr (std::is_same_v) { + if (skip == nullptr) { + std::cout << "missing skip"; + } + const int64_t total_data_size = input->Shape().Size(); + + AllocatorPtr alloc; + ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc)); + + IAllocatorUniquePtr input_fp32; + IAllocatorUniquePtr output_fp32; + IAllocatorUniquePtr skip_input_bias_add_output_fp32; + IAllocatorUniquePtr skip_fp32; + IAllocatorUniquePtr gamma_fp32; + IAllocatorUniquePtr beta_fp32; + IAllocatorUniquePtr bias_fp32; + + const float* input_data_f = nullptr; + const float* skip_data_f = nullptr; + const float* gamma_data_f = nullptr; + const float* beta_data_f = nullptr; + const float* bias_data_f = nullptr; + float* output_data_f = nullptr; + float* skip_input_bias_add_output_data_f = nullptr; + const size_t num_elems = static_cast(hidden_size); - output_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); + input_fp32 = IAllocator::MakeUniquePtr(alloc, total_data_size); + MlasConvertHalfToFloatBuffer(input_data, input_fp32.get(), total_data_size); + input_data_f = input_fp32.get(); + + output_fp32 = IAllocator::MakeUniquePtr(alloc, total_data_size); + output_data_f = output_fp32.get(); + + skip_input_bias_add_output_fp32 = IAllocator::MakeUniquePtr(alloc, total_data_size); + skip_input_bias_add_output_data_f = skip_input_bias_add_output_fp32.get(); + + if (skip_data) { + skip_fp32 = IAllocator::MakeUniquePtr(alloc, skip_size); + MlasConvertHalfToFloatBuffer(skip_data, skip_fp32.get(), skip_size); + skip_data_f = skip_fp32.get(); + } else if(prepacked_skip_fp32_data_) { + skip_data_f = prepacked_skip_fp32_data_.get(); + } - if (prepacked_gamma_fp32_data_ == nullptr && gamma_data) { + if (gamma_data) { gamma_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); MlasConvertHalfToFloatBuffer(gamma_data, gamma_fp32.get(), num_elems); + gamma_data_f = gamma_fp32.get(); + } else if(prepacked_gamma_fp32_data_) { + gamma_data_f = prepacked_gamma_fp32_data_.get(); } - if (prepacked_beta_fp32_data_ == nullptr && beta_data) { + if (beta_data) { beta_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); MlasConvertHalfToFloatBuffer(beta_data, beta_fp32.get(), num_elems); + beta_data_f = beta_fp32.get(); + } else if (prepacked_beta_fp32_data_) { + beta_data_f = prepacked_beta_fp32_data_.get(); } - if (prepacked_bias_fp32_data_ == nullptr && bias_data) { + if (bias_data) { bias_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); MlasConvertHalfToFloatBuffer(bias_data, bias_fp32.get(), num_elems); + bias_data_f = bias_fp32.get(); + } else if (prepacked_bias_fp32_data_) { + bias_data_f = prepacked_bias_fp32_data_.get(); } - } - concurrency::ThreadPool::TryBatchParallelFor( - p_ctx->GetOperatorThreadPool(), static_cast(task_count), - [&](ptrdiff_t task_idx) { - if constexpr (std::is_same_v) { - ComputeJob(input_data, skip_data, - prepacked_skip_fp32_data_.get(), - prepacked_gamma_fp32_data_ ? prepacked_gamma_fp32_data_.get() : gamma_fp32.get(), - prepacked_beta_fp32_data_ ? prepacked_beta_fp32_data_.get() : beta_fp32.get(), - prepacked_bias_fp32_data_ ? prepacked_bias_fp32_data_.get() : bias_fp32.get(), - output_fp32.get(), - task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, - skip_input_bias_add_output_data, alloc); - } else { + concurrency::ThreadPool::TryBatchParallelFor( + p_ctx->GetOperatorThreadPool(), static_cast(task_count), + [&](ptrdiff_t task_idx) { + ComputeJob(input_data_f, skip_data_f, gamma_data_f, beta_data_f, bias_data_f, task_idx, hidden_size, skip_size, + epsilon_, simplified, output_data_f, skip_input_bias_add_output_data_f); + }, + 0); + MlasConvertFloatToHalfBuffer(output_data_f, output_data, total_data_size); + if (skip_input_bias_add_output_data != nullptr) + MlasConvertFloatToHalfBuffer(skip_input_bias_add_output_data_f, skip_input_bias_add_output_data, total_data_size); + } else { + concurrency::ThreadPool::TryBatchParallelFor( + p_ctx->GetOperatorThreadPool(), static_cast(task_count), + [&](ptrdiff_t task_idx) { ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, skip_input_bias_add_output_data); - } - }, - 0); + }, + 0); + } return Status::OK(); } @@ -290,6 +257,9 @@ template Status SkipLayerNorm::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool& is_packed, PrePackedWeights* prepacked_weights) { ORT_UNUSED_PARAMETER(prepacked_weights); + ORT_UNUSED_PARAMETER(tensor); + ORT_UNUSED_PARAMETER(input_idx); + ORT_UNUSED_PARAMETER(alloc); is_packed = false; if (input_idx == 1) { // skip @@ -297,9 +267,17 @@ Status SkipLayerNorm::PrePack(const Tensor& tensor, int input_idx ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_skip_fp32_data_, is_packed); } else if (input_idx == 2) { // gamma ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_gamma_fp32_data_, is_packed); - } else if (input_idx == 3) { // beta - ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_beta_fp32_data_, is_packed); + } else if (input_idx == 3) { + if (simplified) + { + // bias + ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_bias_fp32_data_, is_packed); + } else { + // beta + ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_beta_fp32_data_, is_packed); + } } else if (input_idx == 4) { // bias + ORT_ENFORCE(!simplified, "SkipSimplifiedLayerNormalization should only has 4 inputs (input, skip, gamma, and beta). Got 5."); ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_bias_fp32_data_, is_packed); } From d4020b38a483e7f95de89b4d589b6a37d5e90b91 Mon Sep 17 00:00:00 2001 From: liqun Fu Date: Fri, 15 Nov 2024 16:47:59 -0800 Subject: [PATCH 2/8] Update onnxruntime/contrib_ops/cpu/skip_layer_norm.cc Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 3b88e9068a0cb..06563b17ddb51 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -202,7 +202,7 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { skip_fp32 = IAllocator::MakeUniquePtr(alloc, skip_size); MlasConvertHalfToFloatBuffer(skip_data, skip_fp32.get(), skip_size); skip_data_f = skip_fp32.get(); - } else if(prepacked_skip_fp32_data_) { + } else if (prepacked_skip_fp32_data_) { skip_data_f = prepacked_skip_fp32_data_.get(); } From a0c07f28c8aec9c90aef08c7340f230fb6427792 Mon Sep 17 00:00:00 2001 From: liqun Fu Date: Fri, 15 Nov 2024 16:48:07 -0800 Subject: [PATCH 3/8] Update onnxruntime/contrib_ops/cpu/skip_layer_norm.cc Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 06563b17ddb51..602ba74eccb74 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -210,7 +210,7 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { gamma_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); MlasConvertHalfToFloatBuffer(gamma_data, gamma_fp32.get(), num_elems); gamma_data_f = gamma_fp32.get(); - } else if(prepacked_gamma_fp32_data_) { + } else if (prepacked_gamma_fp32_data_) { gamma_data_f = prepacked_gamma_fp32_data_.get(); } From 8b81c2495887166bbb6285faa0f593943d396bef Mon Sep 17 00:00:00 2001 From: liqun Fu Date: Fri, 15 Nov 2024 16:48:15 -0800 Subject: [PATCH 4/8] Update onnxruntime/contrib_ops/cpu/skip_layer_norm.cc Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 602ba74eccb74..6c6c181813b2f 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -268,8 +268,7 @@ Status SkipLayerNorm::PrePack(const Tensor& tensor, int input_idx } else if (input_idx == 2) { // gamma ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_gamma_fp32_data_, is_packed); } else if (input_idx == 3) { - if (simplified) - { + if (simplified) { // bias ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_bias_fp32_data_, is_packed); } else { From ada93ead32092a38d9f4a0896b242f5df87f7618 Mon Sep 17 00:00:00 2001 From: Liqun Fu Date: Fri, 15 Nov 2024 16:50:26 -0800 Subject: [PATCH 5/8] remove unneeded Signed-off-by: Liqun Fu --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 6c6c181813b2f..5d213ee892ed2 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -257,9 +257,6 @@ template Status SkipLayerNorm::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool& is_packed, PrePackedWeights* prepacked_weights) { ORT_UNUSED_PARAMETER(prepacked_weights); - ORT_UNUSED_PARAMETER(tensor); - ORT_UNUSED_PARAMETER(input_idx); - ORT_UNUSED_PARAMETER(alloc); is_packed = false; if (input_idx == 1) { // skip From e73eaf4a2db449d8bb06003d4d88770070e49b09 Mon Sep 17 00:00:00 2001 From: Liqun Fu Date: Fri, 15 Nov 2024 20:20:04 -0800 Subject: [PATCH 6/8] lint Signed-off-by: Liqun Fu --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 5d213ee892ed2..69e01dc89eed7 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -162,9 +162,6 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { const int64_t skip_size = skip ? skip->Shape().Size() : prepacked_skip_fp32_size_; if constexpr (std::is_same_v) { - if (skip == nullptr) { - std::cout << "missing skip"; - } const int64_t total_data_size = input->Shape().Size(); AllocatorPtr alloc; @@ -188,19 +185,19 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { const size_t num_elems = static_cast(hidden_size); - input_fp32 = IAllocator::MakeUniquePtr(alloc, total_data_size); + input_fp32 = IAllocator::MakeUniquePtr(alloc, static_cast(total_data_size)); MlasConvertHalfToFloatBuffer(input_data, input_fp32.get(), total_data_size); input_data_f = input_fp32.get(); - output_fp32 = IAllocator::MakeUniquePtr(alloc, total_data_size); + output_fp32 = IAllocator::MakeUniquePtr(alloc, static_cast(total_data_size)); output_data_f = output_fp32.get(); - skip_input_bias_add_output_fp32 = IAllocator::MakeUniquePtr(alloc, total_data_size); + skip_input_bias_add_output_fp32 = IAllocator::MakeUniquePtr(alloc, static_cast(total_data_size)); skip_input_bias_add_output_data_f = skip_input_bias_add_output_fp32.get(); if (skip_data) { - skip_fp32 = IAllocator::MakeUniquePtr(alloc, skip_size); - MlasConvertHalfToFloatBuffer(skip_data, skip_fp32.get(), skip_size); + skip_fp32 = IAllocator::MakeUniquePtr(alloc, static_cast(skip_size)); + MlasConvertHalfToFloatBuffer(skip_data, skip_fp32.get(), static_cast(skip_size)); skip_data_f = skip_fp32.get(); } else if (prepacked_skip_fp32_data_) { skip_data_f = prepacked_skip_fp32_data_.get(); @@ -237,9 +234,9 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { epsilon_, simplified, output_data_f, skip_input_bias_add_output_data_f); }, 0); - MlasConvertFloatToHalfBuffer(output_data_f, output_data, total_data_size); + MlasConvertFloatToHalfBuffer(output_data_f, output_data, static_cast(total_data_size)); if (skip_input_bias_add_output_data != nullptr) - MlasConvertFloatToHalfBuffer(skip_input_bias_add_output_data_f, skip_input_bias_add_output_data, total_data_size); + MlasConvertFloatToHalfBuffer(skip_input_bias_add_output_data_f, skip_input_bias_add_output_data, static_cast(total_data_size)); } else { concurrency::ThreadPool::TryBatchParallelFor( p_ctx->GetOperatorThreadPool(), static_cast(task_count), From 07cc88f9f7aa7542a528e68fc0082da2a479ba6c Mon Sep 17 00:00:00 2001 From: Liqun Fu Date: Fri, 15 Nov 2024 20:46:29 -0800 Subject: [PATCH 7/8] lint Signed-off-by: Liqun Fu --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 69e01dc89eed7..39abe6c560d37 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -162,7 +162,7 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { const int64_t skip_size = skip ? skip->Shape().Size() : prepacked_skip_fp32_size_; if constexpr (std::is_same_v) { - const int64_t total_data_size = input->Shape().Size(); + const size_t total_data_size = static_cast(input->Shape().Size()); AllocatorPtr alloc; ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc)); @@ -185,14 +185,14 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { const size_t num_elems = static_cast(hidden_size); - input_fp32 = IAllocator::MakeUniquePtr(alloc, static_cast(total_data_size)); + input_fp32 = IAllocator::MakeUniquePtr(alloc, total_data_size); MlasConvertHalfToFloatBuffer(input_data, input_fp32.get(), total_data_size); input_data_f = input_fp32.get(); - output_fp32 = IAllocator::MakeUniquePtr(alloc, static_cast(total_data_size)); + output_fp32 = IAllocator::MakeUniquePtr(alloc, total_data_size); output_data_f = output_fp32.get(); - skip_input_bias_add_output_fp32 = IAllocator::MakeUniquePtr(alloc, static_cast(total_data_size)); + skip_input_bias_add_output_fp32 = IAllocator::MakeUniquePtr(alloc, total_data_size); skip_input_bias_add_output_data_f = skip_input_bias_add_output_fp32.get(); if (skip_data) { @@ -234,9 +234,9 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { epsilon_, simplified, output_data_f, skip_input_bias_add_output_data_f); }, 0); - MlasConvertFloatToHalfBuffer(output_data_f, output_data, static_cast(total_data_size)); + MlasConvertFloatToHalfBuffer(output_data_f, output_data, total_data_size); if (skip_input_bias_add_output_data != nullptr) - MlasConvertFloatToHalfBuffer(skip_input_bias_add_output_data_f, skip_input_bias_add_output_data, static_cast(total_data_size)); + MlasConvertFloatToHalfBuffer(skip_input_bias_add_output_data_f, skip_input_bias_add_output_data, total_data_size); } else { concurrency::ThreadPool::TryBatchParallelFor( p_ctx->GetOperatorThreadPool(), static_cast(task_count), @@ -262,7 +262,7 @@ Status SkipLayerNorm::PrePack(const Tensor& tensor, int input_idx } else if (input_idx == 2) { // gamma ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_gamma_fp32_data_, is_packed); } else if (input_idx == 3) { - if (simplified) { + if constexpr (simplified) { // bias ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_bias_fp32_data_, is_packed); } else { From 8628147b4c8d0d575c24525b881d859d3d5f38cb Mon Sep 17 00:00:00 2001 From: Liqun Fu Date: Sun, 17 Nov 2024 11:20:58 -0800 Subject: [PATCH 8/8] remove a blank line Signed-off-by: Liqun Fu --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 39abe6c560d37..d5b8961cf8c5a 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -254,7 +254,6 @@ template Status SkipLayerNorm::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool& is_packed, PrePackedWeights* prepacked_weights) { ORT_UNUSED_PARAMETER(prepacked_weights); - is_packed = false; if (input_idx == 1) { // skip prepacked_skip_fp32_size_ = tensor.Shape().Size();