Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PReLU layer uses the SSE instruction _mm_load_ps, but the data can be misaligned, so it must use _mm_loadu_ps #5149

Merged
merged 3 commits into from
Nov 15, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/layer/x86/prelu_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ int PReLU_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int i = remain_size_start + ii * 4;
__m128 _p128 = _mm_load_ps(ptr + i);
- __m128 _slope128 = _mm_load_ps(slope + i);
+ __m128 _slope128 = _mm_loadu_ps(slope + i);
_mm_store_ps(ptr + i, prelu_sse(_p128, _slope128));
}
remain_size_start += nn_size * 4;
Expand Down Expand Up @@ -157,7 +157,7 @@ int PReLU_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const

float slope = num_slope > 1 ? slope_data[i] : slope_data[0];
#if __SSE2__
- __m128 _slope128 = num_slope > 1 && (elempack == 4) ? _mm_load_ps((const float*)slope_data + i * 4) : _mm_set1_ps(slope);
+ __m128 _slope128 = num_slope > 1 && (elempack == 4) ? _mm_loadu_ps((const float*)slope_data + i * 4) : _mm_set1_ps(slope);
#if __AVX__
__m256 _slope256 = num_slope > 1 && (elempack == 8) ? _mm256_loadu_ps((const float*)slope_data + i * 8) : _mm256_insertf128_ps(_mm256_castps128_ps256(_slope128), _slope128, 1);
#if __AVX512F__
Expand Down Expand Up @@ -205,7 +205,7 @@ int PReLU_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const

float slope = num_slope > 1 ? slope_data[q] : slope_data[0];
#if __SSE2__
- __m128 _slope128 = num_slope > 1 && (elempack == 4) ? _mm_load_ps((const float*)slope_data + q * 4) : _mm_set1_ps(slope);
+ __m128 _slope128 = num_slope > 1 && (elempack == 4) ? _mm_loadu_ps((const float*)slope_data + q * 4) : _mm_set1_ps(slope);
#if __AVX__
__m256 _slope256 = num_slope > 1 && (elempack == 8) ? _mm256_loadu_ps((const float*)slope_data + q * 8) : _mm256_insertf128_ps(_mm256_castps128_ps256(_slope128), _slope128, 1);
#if __AVX512F__
Expand Down
Loading