[AArch64] Lower extending sitofp using tbl #92528

momchil-velikov · 2024-05-17T12:00:54Z

In a similar manner as in https://reviews.llvm.org/D133494
use TBL to place bytes in the upper part of i32 elements
and then convert to float using fixed-point scvtf, i.e.

scvtf Vd.4s, Vn.4s, #24

llvmbot · 2024-05-17T12:01:26Z

@llvm/pr-subscribers-backend-aarch64

Author: Momchil Velikov (momchil-velikov)

Changes

In a similar manner as in https://reviews.llvm.org/D133494
use TBL to place bytes in the upper part of i32 elements
and then convert to float using fixed-point scvtf, i.e.

scvtf Vd.4s, Vn.4s, #<!-- -->24

Full diff: https://github.com/llvm/llvm-project/pull/92528.diff

3 Files Affected:

(modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+2-1)
(modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+108-38)
(added) llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll (+196)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 339a1f1f2f002..4c52dbfa23903 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8333,7 +8333,8 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
     if (OptimizeNoopCopyExpression(CI, *TLI, *DL))
       return true;
 
-    if ((isa<UIToFPInst>(I) || isa<FPToUIInst>(I) || isa<TruncInst>(I)) &&
+    if ((isa<UIToFPInst>(I) || isa<SIToFPInst>(I) || isa<FPToUIInst>(I) ||
+         isa<TruncInst>(I)) &&
         TLI->optimizeExtendOrTruncateConversion(
             I, LI->getLoopFor(I->getParent()), *TTI))
       return true;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1e0071fffe666..619616869b8b3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15691,48 +15691,69 @@ bool AArch64TargetLowering::shouldSinkOperands(
   return false;
 }
 
-static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy,
-                                    bool IsLittleEndian) {
-  Value *Op = ZExt->getOperand(0);
-  auto *SrcTy = cast<FixedVectorType>(Op->getType());
-  auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
-  auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
+static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
+                                 unsigned NumElts, bool IsLittleEndian,
+                                 SmallVectorImpl<int> &Mask) {
   if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
     return false;
 
-  assert(DstWidth % SrcWidth == 0 &&
-         "TBL lowering is not supported for a ZExt instruction with this "
-         "source & destination element type.");
-  unsigned ZExtFactor = DstWidth / SrcWidth;
+  if (DstWidth % SrcWidth != 0)
+    return false;
+
+  unsigned Factor = DstWidth / SrcWidth;
+  unsigned MaskLen = NumElts * Factor;
+
+  Mask.clear();
+  Mask.resize(MaskLen, NumElts);
+
+  unsigned SrcIndex = 0;
+  for (unsigned I = 0; I < MaskLen; I += Factor)
+    Mask[I] = SrcIndex++;
+
+  if (!IsLittleEndian)
+    std::rotate(Mask.rbegin(), Mask.rbegin() + Factor - 1, Mask.rend());
+
+  return true;
+}
+
+static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
+                                      FixedVectorType *ZExtTy,
+                                      FixedVectorType *DstTy,
+                                      bool IsLittleEndian) {
+  auto *SrcTy = cast<FixedVectorType>(Op->getType());
   unsigned NumElts = SrcTy->getNumElements();
-  IRBuilder<> Builder(ZExt);
+  auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
+  auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
+
   SmallVector<int> Mask;
-  // Create a mask that selects <0,...,Op[i]> for each lane of the destination
-  // vector to replace the original ZExt. This can later be lowered to a set of
-  // tbl instructions.
-  for (unsigned i = 0; i < NumElts * ZExtFactor; i++) {
-    if (IsLittleEndian) {
-      if (i % ZExtFactor == 0)
-        Mask.push_back(i / ZExtFactor);
-      else
-        Mask.push_back(NumElts);
-    } else {
-      if ((i + 1) % ZExtFactor == 0)
-        Mask.push_back((i - ZExtFactor + 1) / ZExtFactor);
-      else
-        Mask.push_back(NumElts);
-    }
-  }
+  if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
+    return nullptr;
 
   auto *FirstEltZero = Builder.CreateInsertElement(
       PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
   Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
   Result = Builder.CreateBitCast(Result, DstTy);
-  if (DstTy != ZExt->getType())
-    Result = Builder.CreateZExt(Result, ZExt->getType());
-  ZExt->replaceAllUsesWith(Result);
-  ZExt->eraseFromParent();
-  return true;
+  if (DstTy != ZExtTy)
+    Result = Builder.CreateZExt(Result, ZExtTy);
+  return Result;
+}
+
+static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op,
+                                      FixedVectorType *DstTy,
+                                      bool IsLittleEndian) {
+  auto *SrcTy = cast<FixedVectorType>(Op->getType());
+  auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
+  auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
+
+  SmallVector<int> Mask;
+  if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
+                            !IsLittleEndian, Mask))
+    return nullptr;
+
+  auto *FirstEltZero = Builder.CreateInsertElement(
+      PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
+
+  return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
 }
 
 static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
@@ -15897,21 +15918,47 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
 
       DstTy = TruncDstType;
     }
-
-    return createTblShuffleForZExt(ZExt, DstTy, Subtarget->isLittleEndian());
+    IRBuilder<> Builder(ZExt);
+    Value *Result = createTblShuffleForZExt(
+        Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
+        DstTy, Subtarget->isLittleEndian());
+    if (!Result)
+      return false;
+    ZExt->replaceAllUsesWith(Result);
+    ZExt->eraseFromParent();
+    return true;
   }
 
   auto *UIToFP = dyn_cast<UIToFPInst>(I);
   if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
       DstTy->getElementType()->isFloatTy()) {
     IRBuilder<> Builder(I);
-    auto *ZExt = cast<ZExtInst>(
-        Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));
+    Value *ZExt = createTblShuffleForZExt(
+        Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
+        FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
+    if (!ZExt)
+      return false;
     auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
     I->replaceAllUsesWith(UI);
     I->eraseFromParent();
-    return createTblShuffleForZExt(ZExt, cast<FixedVectorType>(ZExt->getType()),
-                                   Subtarget->isLittleEndian());
+    return true;
+  }
+
+  auto *SIToFP = dyn_cast<SIToFPInst>(I);
+  if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
+      DstTy->getElementType()->isFloatTy()) {
+    IRBuilder<> Builder(I);
+    auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
+                                            FixedVectorType::getInteger(DstTy),
+                                            Subtarget->isLittleEndian());
+    if (!Shuffle)
+      return false;
+    auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
+    auto *AShr = Builder.CreateAShr(Cast, 24);
+    auto *SI = Builder.CreateSIToFP(AShr, DstTy);
+    I->replaceAllUsesWith(SI);
+    I->eraseFromParent();
+    return true;
   }
 
   // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
@@ -17840,6 +17887,26 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
   return SDValue();
 }
 
+static SDValue performVectorIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() != ISD::SINT_TO_FP || N->getValueType(0) != MVT::v4f32)
+    return SDValue();
+
+  SDNode *VASHR = N->getOperand(0).getNode();
+  if (VASHR->getOpcode() != AArch64ISD::VASHR ||
+      VASHR->getValueType(0) != MVT::v4i32)
+    return SDValue();
+
+  if (!isa<ConstantSDNode>(VASHR->getOperand(1).getNode()) ||
+      VASHR->getConstantOperandVal(1) != 24)
+    return SDValue();
+
+  return SDValue(DAG.getMachineNode(
+                     AArch64::SCVTFv4i32_shift, SDLoc(N), N->getValueType(0),
+                     {VASHR->getOperand(0),
+                      DAG.getTargetConstant(24, SDLoc(N), MVT::i32)}),
+                 0);
+}
+
 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
                                      const AArch64Subtarget *Subtarget) {
   // First try to optimize away the conversion when it's conditionally from
@@ -17847,6 +17914,9 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
   if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
     return Res;
 
+  if (SDValue Res = performVectorIntToFpCombine(N, DAG))
+    return Res;
+
   EVT VT = N->getValueType(0);
   if (VT != MVT::f32 && VT != MVT::f64)
     return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll b/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll
new file mode 100644
index 0000000000000..dbf15ab9936f6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll
@@ -0,0 +1,196 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+; CHECK-LABEL: .LCPI0_0:
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    4
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    5
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    6
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    7
+; CHECK-NEXT:  .LCPI0_1:
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    0
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    1
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    2
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    255
+; CHECK-NEXT:  .byte    255
+
+define void @sitofp_v8i8_to_v8f32(ptr %src, ptr %dst) {
+; CHECK-LABEL: sitofp_v8i8_to_v8f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    adrp x8, .LCPI0_0
+; CHECK-NEXT:    adrp x9, .LCPI0_1
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI0_1]
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  .LBB0_1: // %loop
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr d2, [x0, x8, lsl #3]
+; CHECK-NEXT:    add x9, x1, x8, lsl #5
+; CHECK-NEXT:    add x8, x8, #1
+; CHECK-NEXT:    cmp x8, #1000
+; CHECK-NEXT:    tbl v3.16b, { v2.16b }, v0.16b
+; CHECK-NEXT:    tbl v2.16b, { v2.16b }, v1.16b
+; CHECK-NEXT:    scvtf v3.4s, v3.4s, #24
+; CHECK-NEXT:    scvtf v2.4s, v2.4s, #24
+; CHECK-NEXT:    stp q2, q3, [x9]
+; CHECK-NEXT:    b.eq .LBB0_1
+; CHECK-NEXT:  // %bb.2: // %exit
+; CHECK-NEXT:    ret
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.src = getelementptr inbounds <8 x i8>, ptr %src, i64 %iv
+  %l = load <8 x i8>, ptr %gep.src
+  %conv = sitofp <8 x i8> %l to <8 x float>
+  %gep.dst = getelementptr inbounds <8 x float>, ptr %dst, i64 %iv
+  store <8 x float> %conv, ptr %gep.dst
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: .LCPI1_0:
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    12
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    13
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    14
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    15 
+; CHECK-NEXT: .LCPI1_1:
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    8
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    9
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    10
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    11
+; CHECK-NEXT: .LCPI1_2:
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    4
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    5
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    6
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    7
+; CHECK-NEXT: .LCPI1_3:
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    0
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    1
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    2
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    255
+; CHECK-NEXT: .byte    3
+
+define void @sitofp_v16i8_to_v16f32(ptr %src, ptr %dst) {
+; CHECK-LABEL: sitofp_v16i8_to_v16f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    adrp x8, .LCPI1_0
+; CHECK-NEXT:    adrp x9, .LCPI1_1
+; CHECK-NEXT:    adrp x10, .LCPI1_2
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT:    adrp x8, .LCPI1_3
+; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI1_1]
+; CHECK-NEXT:    ldr q2, [x10, :lo12:.LCPI1_2]
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI1_3]
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  .LBB1_1: // %loop
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr q4, [x0, x8, lsl #4]
+; CHECK-NEXT:    add x9, x1, x8, lsl #6
+; CHECK-NEXT:    add x8, x8, #1
+; CHECK-NEXT:    cmp x8, #1000
+; CHECK-NEXT:    tbl v5.16b, { v4.16b }, v0.16b
+; CHECK-NEXT:    tbl v6.16b, { v4.16b }, v1.16b
+; CHECK-NEXT:    tbl v7.16b, { v4.16b }, v2.16b
+; CHECK-NEXT:    tbl v4.16b, { v4.16b }, v3.16b
+; CHECK-NEXT:    scvtf v5.4s, v5.4s, #24
+; CHECK-NEXT:    scvtf v6.4s, v6.4s, #24
+; CHECK-NEXT:    scvtf v7.4s, v7.4s, #24
+; CHECK-NEXT:    scvtf v4.4s, v4.4s, #24
+; CHECK-NEXT:    stp q6, q5, [x9, #32]
+; CHECK-NEXT:    stp q4, q7, [x9]
+; CHECK-NEXT:    b.eq .LBB1_1
+; CHECK-NEXT:  // %bb.2: // %exit
+; CHECK-NEXT:    ret
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.src = getelementptr inbounds <16 x i8>, ptr %src, i64 %iv
+  %l = load <16 x i8>, ptr %gep.src
+  %conv = sitofp <16 x i8> %l to <16 x float>
+  %gep.dst = getelementptr inbounds <16 x float>, ptr %dst, i64 %iv
+  store <16 x float> %conv, ptr %gep.dst
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret void
+}

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

momchil-velikov · 2024-06-07T13:46:00Z

Ping?

davemgreen

Considering this operates on sitofp I think it should be OK. LGTM

llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll

... in preparation for #92528

In a similar manner as in https://reviews.llvm.org/D133494 use `TBL` to place bytes in the *upper* part of `i32` elements and then convert to float using fixed-point `scvtf`, i.e. scvtf Vd.4s, Vn.4s, llvm#24 Change-Id: Ib9df3e4243612cbee8560907b24b14e76b61f265

scweng · 2024-06-26T16:55:18Z

Hi, at Google we are currently seeing two test failures after this commit:

In Jax, lax_numpy_test has some floating point number differences like -0.75 vs -0.6875 and 0.5 vs 0.6875.
In an internal test case for Eigen3, -4.22987 vs -5.

We are still trying to reduce the Eigen3 test, which is pure C++. I know that this change was landed over a week ago, but with the two differences both point to this commit, is it possible to roll it back until further investigation?

[edit]

Actually, the Eigen failure is not internal -- the test infra is, but the test that fails is https://gitlab.com/libeigen/eigen/-/blob/master/test/array_cwise.cpp?ref_type=heads

This reverts commit d1a4f0c. There are reports about test failures with Eigen and JAX.

scweng · 2024-06-27T01:38:00Z

Thank you! I'm working on reducing the case.

scweng · 2024-06-28T19:09:41Z

The .ii file is still over 500k bytes long because eigen is a header library. But I've seen the assembly diff boils down to two snippets of exactly this. Before this commit

          fmov    s0, w21
          mov     v0.s[1], w22
          shl     v0.2s, v0.2s, #24
          sshr    v0.2s, v0.2s, #24
          scvtf   v0.2s, v0.2s
          fcvtzs  w8, s0
          str     q0, [sp]                        // 16-byte Folded Spill
          cmp     w8, w21, sxtb
          b.ne    .LBB16_10
  // %bb.2:                               // %if.end
                                          //   in Loop: Header=BB16_1 Depth=1
          // ... function call removed ...
          ldr     q0, [sp]                        // 16-byte Folded Reload
          mov     s0, v0.s[1]
          fcvtzs  w8, s0
          cmp     w8, w22, sxtb
          b.ne    .LBB16_9

At this commit:

          mov     v0.b[3], w21
          mov     v0.b[7], w22
          scvtf   v0.2s, v0.2s, #24
          fcvtzs  w8, s0
          str     q0, [sp]                        // 16-byte Folded Spill
          cmp     w8, w21, sxtb
          b.ne    .LBB16_10
  // %bb.2:                               // %if.end
                                          //   in Loop: Header=BB16_1 Depth=1
          // ... function call removed ...
          ldr     q0, [sp]                        // 16-byte Folded Reload
          mov     s0, v0.s[1]
          fcvtzs  w8, s0
          cmp     w8, w22, sxtb
          b.ne    .LBB16_9

The other part is identical except for the input registers (w20 and w19 except w21 and w22). Unified diff below:

--- array_cwise_no_sitofp_tbl.s 2024-06-29 02:48:01.129830833 +0800
+++ array_cwise_sitofp_tbl.s    2024-06-29 02:50:15.145362022 +0800
@@ -1296,11 +1296,9 @@
        bl      rand
        mov     w19, w0
        bl      rand
-       fmov    s0, w21
-       mov     v0.s[1], w22
-       shl     v0.2s, v0.2s, #24
-       sshr    v0.2s, v0.2s, #24
-       scvtf   v0.2s, v0.2s
+       mov     v0.b[3], w21
+       mov     v0.b[7], w22
+       scvtf   v0.2s, v0.2s, #24
        fcvtzs  w8, s0
        str     q0, [sp]                        // 16-byte Folded Spill
        cmp     w8, w21, sxtb
@@ -1317,13 +1315,11 @@
        b.ne    .LBB16_9
 // %bb.3:                               // %if.end.1
                                         //   in Loop: Header=BB16_1 Depth=1
-       fmov    s0, w20
+       mov     v0.b[3], w20
        ldr     x8, [x24, :lo12:.L_MergedGlobals+8]
        sub     x0, x8, #32
-       mov     v0.s[1], w19
-       shl     v0.2s, v0.2s, #24
-       sshr    v0.2s, v0.2s, #24
-       scvtf   v0.2s, v0.2s
+       mov     v0.b[7], w19
+       scvtf   v0.2s, v0.2s, #24
        str     q0, [sp]                        // 16-byte Folded Spill
        bl      _ZNKSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE5c_strEv
        ldr     q0, [sp]                        // 16-byte Folded Reload

I'm not sure how the two snippets behave differently. And if they do behave differently, it feels like a latent bug uncovered by this commit?

scweng · 2024-06-28T19:17:56Z

Oh, "MOV (from general)" used in the new assembly sequences does not clear other bits, while the shift left then right method does, doesn't it? That would explain that the differences seem to be precision loss.

scweng · 2024-06-28T21:17:25Z

I'm pretty confident that this is a pre-existing bug somewhere in instruction selection that was revealed by this commit.

-print-before-all at this commit, %0 through %3 are all return values of i32 @rand() trunc to i8:

*** IR Dump Before CodeGen Prepare (codegenprepare) ***
; Function Attrs: mustprogress uwtable
define linkonce_odr dso_local void @_ZN14cast_test_implIafLi5ELi1EE3runEv() local_unnamed_addr #6 comdat personality ptr @__gxx_personality_v0 {
...
  %4 = insertelement <2 x i8> poison, i8 %0, i64 0
  %5 = insertelement <2 x i8> %4, i8 %1, i64 1
  %shuffle.i.i2.i.i.i.i.i.i.i.i.i.i.i.i = sitofp <2 x i8> %5 to <2 x float>
  %6 = insertelement <2 x i8> poison, i8 %2, i64 0
  %7 = insertelement <2 x i8> %6, i8 %3, i64 1
  %shuffle.i.i2.i.i.i.i.i.i.i.i.i.i.i.i.i = sitofp <2 x i8> %7 to <2 x float>
  %dst.sroa.0.0.vec.extract = extractelement <2 x float> %shuffle.i.i2.i.i.i.i.i.i.i.i.i.i.i.i, i64 0
  %conv.i97 = fptosi float %dst.sroa.0.0.vec.extract to i32
  %sext = shl i32 %call.i.i.i.i.i.i.i.i.i.i.i.i.i.i.i.i.i, 24
  %conv1.i = ashr exact i32 %sext, 24
  %cmp.i.i.i.i = icmp eq i32 %conv1.i, %conv.i97
  br i1 %cmp.i.i.i.i, label %if.end, label %if.then
...

*** IR Dump Before Exception handling preparation (dwarf-eh-prepare) ***
; Function Attrs: mustprogress uwtable
define linkonce_odr dso_local void @_ZN14cast_test_implIafLi5ELi1EE3runEv() local_unnamed_addr #6 comdat personality ptr @__gxx_personality_v0 {
...
  %4 = insertelement <2 x i8> poison, i8 %0, i64 0
  %5 = insertelement <2 x i8> %4, i8 %1, i64 1
  %6 = shufflevector <2 x i8> %5, <2 x i8> <i8 0, i8 poison>, <8 x i32> <i32 2, i32 2, i32 2, i32 0, i32 2, i32 2, i32 2, i32 1>
  %7 = bitcast <8 x i8> %6 to <2 x i32>
  %8 = ashr exact <2 x i32> %7, <i32 24, i32 24>
  %9 = sitofp <2 x i32> %8 to <2 x float>
  %10 = insertelement <2 x i8> poison, i8 %2, i64 0
  %11 = insertelement <2 x i8> %10, i8 %3, i64 1
  %12 = shufflevector <2 x i8> %11, <2 x i8> <i8 0, i8 poison>, <8 x i32> <i32 2, i32 2, i32 2, i32 0, i32 2, i32 2, i32 2, i32 1>
  %13 = bitcast <8 x i8> %12 to <2 x i32>
  %14 = ashr exact <2 x i32> %13, <i32 24, i32 24>
  %15 = sitofp <2 x i32> %14 to <2 x float>
  %dst.sroa.0.0.vec.extract = extractelement <2 x float> %9, i64 0
  %conv.i97 = fptosi float %dst.sroa.0.0.vec.extract to i32
  %sext = shl i32 %call.i.i.i.i.i.i.i.i.i.i.i.i.i.i.i.i.i, 24
  %conv1.i = ashr exact i32 %sext, 24
  %cmp.i.i.i.i = icmp eq i32 %conv1.i, %conv.i97
  br i1 %cmp.i.i.i.i, label %if.end, label %if.then

The instruction after this commit, %6 = shufflevector <2 x i8> %5, <2 x i8> <i8 0, i8 poison>, <8 x i32> <i32 2, i32 2, i32 2, i32 0, i32 2, i32 2, i32 2, i32 1> clearly requests v0.[0, 1, 2, 4, 5, 6] to be i8 0, but mov v0.b[3], w21; mov v0.b[7], w22 fails to do so.

Here's the repro: https://godbolt.org/z/686fbPjKz

define dso_local noundef <2 x float> @before(i8 noundef %0, i8 noundef %1) {
  %3 = insertelement <2 x i8> poison, i8 %0, i64 0
  %4 = insertelement <2 x i8> %3, i8 %1, i64 1
  %5 = sitofp <2 x i8> %4 to <2 x float>
  ret <2 x float> %5
}

define dso_local noundef <2 x float> @after(i8 noundef %0, i8 noundef %1) {
  %3 = insertelement <2 x i8> poison, i8 %0, i64 0
  %4 = insertelement <2 x i8> %3, i8 %1, i64 1
  %5 = shufflevector <2 x i8> %4, <2 x i8> <i8 0, i8 poison>, <8 x i32> <i32 2, i32 2, i32 2, i32 0, i32 2, i32 2, i32 2, i32 1>
  %6 = bitcast <8 x i8> %5 to <2 x i32>
  %7 = ashr exact <2 x i32> %6, <i32 24, i32 24>
  %8 = sitofp <2 x i32> %7 to <2 x float>
  ret <2 x float> %8
}

results in

before:
        fmov    s0, w0
        mov     v0.s[1], w1
        shl     v0.2s, v0.2s, #24
        sshr    v0.2s, v0.2s, #24
        scvtf   v0.2s, v0.2s
        ret
after:
        mov     v0.b[3], w0
        mov     v0.b[7], w1
        scvtf   v0.2s, v0.2s, #24
        ret

momchil-velikov · 2024-07-01T10:28:00Z

Thank you very much, that's incredibly helpful !

This reverts commit d1a4f0c. There are reports about test failures with Eigen and JAX.

When building a vector which contains zero elements, the AArch64 ISel replaces those elements with `undef`, if they are right shifted out. However, these elements need to stay zero if the right shift is exact, or otherwise we will be introducing undefined behavior. Should allow #92528 to be recommitted.

momchil-velikov · 2024-07-05T15:04:42Z

Should be OK to re-commit after #97448
I ran Eigen tests, they look good (well, I get random distinct 0-3 failures on each run, which happens on main too).
Will re-commit next week, unless somebody spots a problem.

When building a vector which contains zero elements, the AArch64 ISel replaces those elements with `undef`, if they are right shifted out. However, these elements need to stay zero if the right shift is exact, or otherwise we will be introducing undefined behavior. Should allow llvm#92528 to be recommitted.

This re-commits d1a4f0c after a issue was fixed in f92bfca ("[AArch64] All bits of an exact right shift are demanded (#97448)").

This reverts commit d1a4f0c. There are reports about test failures with Eigen and JAX.

llvmbot added the backend:AArch64 label May 17, 2024

momchil-velikov mentioned this pull request May 17, 2024

[AArch64] Refactor creation of a shuffle mask for TBL (NFC) #92529

Merged

momchil-velikov requested review from fhahn, hassnaaHamdi, davemgreen, paulwalker-arm, Lukacma and CarolineConcatto May 17, 2024 12:03

davemgreen reviewed May 20, 2024

View reviewed changes

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp Outdated Show resolved Hide resolved

momchil-velikov force-pushed the sitofp branch 2 times, most recently from e6cfa82 to 550b838 Compare May 28, 2024 13:48

momchil-velikov force-pushed the sitofp branch from 550b838 to e8e2ab0 Compare May 31, 2024 10:39

davemgreen approved these changes Jun 11, 2024

View reviewed changes

llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll Show resolved Hide resolved

momchil-velikov force-pushed the sitofp branch from e8e2ab0 to 8936eaf Compare June 12, 2024 15:28

fhahn reviewed Jun 13, 2024

View reviewed changes

llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll Show resolved Hide resolved

momchil-velikov force-pushed the sitofp branch 2 times, most recently from b17f3b6 to fec1355 Compare June 17, 2024 11:12

momchil-velikov added a commit that referenced this pull request Jun 17, 2024

[AArch64] Refactor creation of a shuffle mask for TBL (NFC) (#92529)

96e8d0f

... in preparation for #92528

momchil-velikov added 3 commits June 17, 2024 15:02

fixup: add a missing CHECK line

66315c1

[fixup] Add some more tests

904576f

momchil-velikov force-pushed the sitofp branch from fec1355 to 904576f Compare June 17, 2024 14:32

momchil-velikov merged commit d1a4f0c into llvm:main Jun 17, 2024
5 of 7 checks passed

momchil-velikov added a commit that referenced this pull request Jun 26, 2024

Revert "[AArch64] Lower extending sitofp using tbl (#92528)"

d058b51

This reverts commit d1a4f0c. There are reports about test failures with Eigen and JAX.

momchil-velikov mentioned this pull request Jul 2, 2024

[AArch64] All bits of an exact right shift are demanded #97448

Merged

lravenclaw pushed a commit to lravenclaw/llvm-project that referenced this pull request Jul 3, 2024

Revert "[AArch64] Lower extending sitofp using tbl (llvm#92528)"

e6f6d76

This reverts commit d1a4f0c. There are reports about test failures with Eigen and JAX.

momchil-velikov added a commit that referenced this pull request Jul 8, 2024

Reapply "[AArch64] Lower extending sitofp using tbl (#92528)"

a497e98

This re-commits d1a4f0c after a issue was fixed in f92bfca ("[AArch64] All bits of an exact right shift are demanded (#97448)").

AlexisPerry pushed a commit to llvm-project-tlp/llvm-project that referenced this pull request Jul 9, 2024

Revert "[AArch64] Lower extending sitofp using tbl (llvm#92528)"

e537ed5

This reverts commit d1a4f0c. There are reports about test failures with Eigen and JAX.

momchil-velikov deleted the sitofp branch November 13, 2024 09:33

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[AArch64] Lower extending sitofp using tbl #92528

[AArch64] Lower extending sitofp using tbl #92528

momchil-velikov commented May 17, 2024

llvmbot commented May 17, 2024

momchil-velikov commented Jun 7, 2024

davemgreen left a comment

scweng commented Jun 26, 2024 •

edited

Loading

scweng commented Jun 27, 2024

scweng commented Jun 28, 2024

scweng commented Jun 28, 2024

scweng commented Jun 28, 2024

momchil-velikov commented Jul 1, 2024

momchil-velikov commented Jul 5, 2024

[AArch64] Lower extending sitofp using tbl #92528

[AArch64] Lower extending sitofp using tbl #92528

Conversation

momchil-velikov commented May 17, 2024

llvmbot commented May 17, 2024

momchil-velikov commented Jun 7, 2024

davemgreen left a comment

Choose a reason for hiding this comment

scweng commented Jun 26, 2024 • edited Loading

scweng commented Jun 27, 2024

scweng commented Jun 28, 2024

scweng commented Jun 28, 2024

scweng commented Jun 28, 2024

momchil-velikov commented Jul 1, 2024

momchil-velikov commented Jul 5, 2024

scweng commented Jun 26, 2024 •

edited

Loading