Skip to content

Commit

Permalink
Fix SIGBUS in the ARMv7 cast layer when loading an fp16 model (#5292)
Browse files Browse the repository at this point in the history
* Fix SIGBUS error when loading an fp16 model on ARMv7

* Apply the same alignment-hint removal to the bf16 cast kernels
  • Loading branch information
nihui authored Jan 17, 2024
1 parent a705a24 commit 656b082
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 16 deletions.
4 changes: 2 additions & 2 deletions src/layer/arm/cast_bf16.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ static void cast_fp32_to_bf16_neon(const Mat& bottom_blob, Mat& top_blob, const
"vshrn.u32 d1, q1, #16 \n"
"vshrn.u32 d2, q2, #16 \n"
"vshrn.u32 d3, q3, #16 \n"
"vst1.u16 {d0-d3}, [%1 :128]! \n"
"vst1.u16 {d0-d3}, [%1]! \n"
: "=r"(ptr), // %0
"=r"(outptr) // %1
: "0"(ptr),
Expand Down Expand Up @@ -231,7 +231,7 @@ static void cast_bf16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const
#else // __aarch64__
asm volatile(
"pld [%0, #256] \n"
"vld1.u16 {d4-d7}, [%0 :128]! \n"
"vld1.u16 {d4-d7}, [%0]! \n"
"vshll.u16 q0, d4, #16 \n"
"vshll.u16 q1, d5, #16 \n"
"vshll.u16 q2, d6, #16 \n"
Expand Down
28 changes: 14 additions & 14 deletions src/layer/arm/cast_fp16.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,13 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const
: "memory", "v0", "v1", "v2", "v3");
#else // __aarch64__
asm volatile(
"pld [%0, #512] \n"
"vldm %0!, {d0-d7} \n"
"vcvt.f16.f32 d0, q0 \n"
"vcvt.f16.f32 d1, q1 \n"
"vcvt.f16.f32 d2, q2 \n"
"vcvt.f16.f32 d3, q3 \n"
"vst1.u16 {d0-d3}, [%1 :128]! \n"
"pld [%0, #512] \n"
"vldm %0!, {d0-d7} \n"
"vcvt.f16.f32 d0, q0 \n"
"vcvt.f16.f32 d1, q1 \n"
"vcvt.f16.f32 d2, q2 \n"
"vcvt.f16.f32 d3, q3 \n"
"vst1.u16 {d0-d3}, [%1]! \n"
: "=r"(ptr), // %0
"=r"(outptr) // %1
: "0"(ptr),
Expand Down Expand Up @@ -220,13 +220,13 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const
: "memory", "v0", "v1", "v2", "v3");
#else // __aarch64__
asm volatile(
"pld [%0, #256] \n"
"vld1.u16 {d4-d7}, [%0 :128]! \n"
"vcvt.f32.f16 q0, d4 \n"
"vcvt.f32.f16 q1, d5 \n"
"vcvt.f32.f16 q2, d6 \n"
"vcvt.f32.f16 q3, d7 \n"
"vstm %1!, {d0-d7} \n"
"pld [%0, #256] \n"
"vld1.u16 {d4-d7}, [%0]! \n"
"vcvt.f32.f16 q0, d4 \n"
"vcvt.f32.f16 q1, d5 \n"
"vcvt.f32.f16 q2, d6 \n"
"vcvt.f32.f16 q3, d7 \n"
"vstm %1!, {d0-d7} \n"
: "=r"(ptr), // %0
"=r"(outptr) // %1
: "0"(ptr),
Expand Down

0 comments on commit 656b082

Please sign in to comment.