[Transforms][SLP] Added bytecast support
This commit adds bytecast support to the SLP vectorizer. Before, a
sequence of bytecasts could not be combined into a single SIMD
bundle, which led to poor vectorization, if any vectorization
happened at all.

Now, bytecasts can also be combined together. Currently, we model the
cost of vectorizing a bytecast as 0, since a byte-to-integer
conversion is nothing more than a no-op. However, for some cases (e.g.
byte-to-pointer conversion) we may want a more precise cost model.

Note: The cost of a bytecast is encoded directly in SLPVectorizer. The
main reason for this is that bytecast is not a member of ISD, and
hence cannot be queried through TLI.

Moreover, this commit adds a pattern matcher for bytecasts. It is
needed for load combining, where we look for the following sequence:
  ```
  zext(bytecast(load))
  ```
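For illustration, a minimal IR sketch of that scalar pattern, using
the b8 byte type and bytecast syntax from the tests below (the value
names %p, %b, %i, %w are hypothetical):
  ```
  %b = load b8, b8* %p, align 1  ; load a single byte
  %i = bytecast b8 %b to i8      ; no-op byte-to-integer conversion
  %w = zext i8 %i to i16         ; widen for the or/shift expression
  ```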
georgemitenkov committed Aug 18, 2021
1 parent c1a3c4d commit 165cc34
Showing 4 changed files with 301 additions and 1 deletion.
6 changes: 6 additions & 0 deletions llvm/include/llvm/IR/PatternMatch.h
@@ -1604,6 +1604,12 @@ inline CastClass_match<OpTy, Instruction::BitCast> m_BitCast(const OpTy &Op) {
return CastClass_match<OpTy, Instruction::BitCast>(Op);
}

/// Matches ByteCast.
template <typename OpTy>
inline CastClass_match<OpTy, Instruction::ByteCast> m_ByteCast(const OpTy &Op) {
return CastClass_match<OpTy, Instruction::ByteCast>(Op);
}

/// Matches PtrToInt.
template <typename OpTy>
inline CastClass_match<OpTy, Instruction::PtrToInt> m_PtrToInt(const OpTy &Op) {
8 changes: 7 additions & 1 deletion llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3419,6 +3419,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::ByteCast:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
for (Value *V : VL) {
@@ -4397,6 +4398,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,

return Cost;
}
case Instruction::ByteCast:
// Currently, treat bytecasts as a no-op.
return 0;
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -4754,7 +4758,8 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
// Check if the input is an extended load (possibly through a bytecast)
// of the required or/shift expression.
Value *LoadPtr;
if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
    (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))) &&
     !match(ZextLoad, m_ZExt(m_ByteCast(m_Load(m_Value(LoadPtr)))))))
  return false;

// Require that the total load bit width is a legal integer type.
@@ -5754,6 +5759,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::ByteCast:
case Instruction::BitCast: {
setInsertPointAfterBundle(E);

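To illustrate the effect of vectorizing a bytecast bundle, here is a
condensed before/after sketch for two adjacent byte loads (it mirrors
the load_bytecast_ext test below; %p0, %p1 and %vp are hypothetical
names, with %vp standing for the vectorizer's bitcast of %p0 to a
vector pointer):
  ```
  ; Before SLP: two scalar bytecasts that previously could not be bundled.
  %b0 = load b8, b8* %p0, align 1
  %b1 = load b8, b8* %p1, align 1
  %i0 = bytecast b8 %b0 to i8
  %i1 = bytecast b8 %b1 to i8

  ; After SLP: one vector load feeding a single vector bytecast.
  %v = load <2 x b8>, <2 x b8>* %vp, align 1
  %iv = bytecast <2 x b8> %v to <2 x i8>
  ```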
170 changes: 170 additions & 0 deletions llvm/test/Transforms/SLPVectorizer/AArch64/widen-bytecast.ll
@@ -0,0 +1,170 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -S < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64"

; This should not be matched as a load combining candidate.
; There are no 'or' operations, so it can't be a bswap or
; other pattern that we are expecting the backend to handle.

define void @PR50256_with_byte(b8* %a, i16* %c, i32 %n) {
; CHECK-LABEL: @PR50256_with_byte(
; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds b8, b8* [[A:%.*]], i64 1
; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds b8, b8* [[A]], i64 2
; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds b8, b8* [[A]], i64 3
; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds b8, b8* [[A]], i64 4
; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds b8, b8* [[A]], i64 5
; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds b8, b8* [[A]], i64 7
; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds b8, b8* [[A]], i64 6
; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds b8, b8* [[A]], i64 8
; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds b8, b8* [[A]], i64 9
; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds b8, b8* [[A]], i64 10
; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds b8, b8* [[A]], i64 11
; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds b8, b8* [[A]], i64 12
; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds b8, b8* [[A]], i64 13
; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds b8, b8* [[A]], i64 14
; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds b8, b8* [[A]], i64 15
; CHECK-NEXT: [[TMP1:%.*]] = bitcast b8* [[A]] to <8 x b8>*
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x b8>, <8 x b8>* [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = bitcast b8* [[ARRAYIDX_8]] to <8 x b8>*
; CHECK-NEXT: [[TMP4:%.*]] = load <8 x b8>, <8 x b8>* [[TMP3]], align 1
; CHECK-NEXT: [[TMP5:%.*]] = bytecast <8 x b8> [[TMP2]] to <8 x i8>
; CHECK-NEXT: [[TMP6:%.*]] = bytecast <8 x b8> [[TMP4]] to <8 x i8>
; CHECK-NEXT: [[TMP7:%.*]] = zext <8 x i8> [[TMP5]] to <8 x i16>
; CHECK-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[TMP6]] to <8 x i16>
; CHECK-NEXT: [[TMP9:%.*]] = shl nuw <8 x i16> [[TMP7]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
; CHECK-NEXT: [[TMP10:%.*]] = shl nuw <8 x i16> [[TMP8]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i16, i16* [[C:%.*]], i64 1
; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i16, i16* [[C]], i64 2
; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i16, i16* [[C]], i64 3
; CHECK-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds i16, i16* [[C]], i64 4
; CHECK-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds i16, i16* [[C]], i64 5
; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds i16, i16* [[C]], i64 6
; CHECK-NEXT: [[ARRAYIDX3_7:%.*]] = getelementptr inbounds i16, i16* [[C]], i64 7
; CHECK-NEXT: [[ARRAYIDX3_8:%.*]] = getelementptr inbounds i16, i16* [[C]], i64 8
; CHECK-NEXT: [[ARRAYIDX3_9:%.*]] = getelementptr inbounds i16, i16* [[C]], i64 9
; CHECK-NEXT: [[ARRAYIDX3_10:%.*]] = getelementptr inbounds i16, i16* [[C]], i64 10
; CHECK-NEXT: [[ARRAYIDX3_11:%.*]] = getelementptr inbounds i16, i16* [[C]], i64 11
; CHECK-NEXT: [[ARRAYIDX3_12:%.*]] = getelementptr inbounds i16, i16* [[C]], i64 12
; CHECK-NEXT: [[ARRAYIDX3_13:%.*]] = getelementptr inbounds i16, i16* [[C]], i64 13
; CHECK-NEXT: [[ARRAYIDX3_14:%.*]] = getelementptr inbounds i16, i16* [[C]], i64 14
; CHECK-NEXT: [[ARRAYIDX3_15:%.*]] = getelementptr inbounds i16, i16* [[C]], i64 15
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16* [[C]] to <8 x i16>*
; CHECK-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* [[TMP11]], align 2
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16* [[ARRAYIDX3_8]] to <8 x i16>*
; CHECK-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* [[TMP12]], align 2
; CHECK-NEXT: ret void
;
%arrayidx.1 = getelementptr inbounds b8, b8* %a, i64 1
%arrayidx.2 = getelementptr inbounds b8, b8* %a, i64 2
%arrayidx.3 = getelementptr inbounds b8, b8* %a, i64 3
%arrayidx.4 = getelementptr inbounds b8, b8* %a, i64 4
%arrayidx.5 = getelementptr inbounds b8, b8* %a, i64 5
%arrayidx.7 = getelementptr inbounds b8, b8* %a, i64 7
%arrayidx.6 = getelementptr inbounds b8, b8* %a, i64 6
%arrayidx.8 = getelementptr inbounds b8, b8* %a, i64 8
%arrayidx.9 = getelementptr inbounds b8, b8* %a, i64 9
%arrayidx.10 = getelementptr inbounds b8, b8* %a, i64 10
%arrayidx.11 = getelementptr inbounds b8, b8* %a, i64 11
%arrayidx.12 = getelementptr inbounds b8, b8* %a, i64 12
%arrayidx.13 = getelementptr inbounds b8, b8* %a, i64 13
%arrayidx.14 = getelementptr inbounds b8, b8* %a, i64 14
%arrayidx.15 = getelementptr inbounds b8, b8* %a, i64 15
%b = load b8, b8* %a, align 1
%b1 = load b8, b8* %arrayidx.1, align 1
%b2 = load b8, b8* %arrayidx.2, align 1
%b3 = load b8, b8* %arrayidx.3, align 1
%b4 = load b8, b8* %arrayidx.4, align 1
%b5 = load b8, b8* %arrayidx.5, align 1
%b6 = load b8, b8* %arrayidx.6, align 1
%b7 = load b8, b8* %arrayidx.7, align 1
%b8 = load b8, b8* %arrayidx.8, align 1
%b9 = load b8, b8* %arrayidx.9, align 1
%b10 = load b8, b8* %arrayidx.10, align 1
%b11 = load b8, b8* %arrayidx.11, align 1
%b12 = load b8, b8* %arrayidx.12, align 1
%b13 = load b8, b8* %arrayidx.13, align 1
%b14 = load b8, b8* %arrayidx.14, align 1
%b15 = load b8, b8* %arrayidx.15, align 1
%i = bytecast b8 %b to i8
%i1 = bytecast b8 %b1 to i8
%i2 = bytecast b8 %b2 to i8
%i3 = bytecast b8 %b3 to i8
%i4 = bytecast b8 %b4 to i8
%i5 = bytecast b8 %b5 to i8
%i6 = bytecast b8 %b6 to i8
%i7 = bytecast b8 %b7 to i8
%i8 = bytecast b8 %b8 to i8
%i9 = bytecast b8 %b9 to i8
%i10 = bytecast b8 %b10 to i8
%i11 = bytecast b8 %b11 to i8
%i12 = bytecast b8 %b12 to i8
%i13 = bytecast b8 %b13 to i8
%i14 = bytecast b8 %b14 to i8
%i15 = bytecast b8 %b15 to i8
%conv5 = zext i8 %i to i16
%conv5.1 = zext i8 %i1 to i16
%conv5.2 = zext i8 %i2 to i16
%conv5.3 = zext i8 %i3 to i16
%conv5.4 = zext i8 %i4 to i16
%conv5.5 = zext i8 %i5 to i16
%conv5.6 = zext i8 %i6 to i16
%conv5.7 = zext i8 %i7 to i16
%conv5.8 = zext i8 %i8 to i16
%conv5.9 = zext i8 %i9 to i16
%conv5.10 = zext i8 %i10 to i16
%conv5.11 = zext i8 %i11 to i16
%conv5.12 = zext i8 %i12 to i16
%conv5.13 = zext i8 %i13 to i16
%conv5.14 = zext i8 %i14 to i16
%conv5.15 = zext i8 %i15 to i16
%shl = shl nuw i16 %conv5, 8
%shl.1 = shl nuw i16 %conv5.1, 8
%shl.2 = shl nuw i16 %conv5.2, 8
%shl.3 = shl nuw i16 %conv5.3, 8
%shl.4 = shl nuw i16 %conv5.4, 8
%shl.5 = shl nuw i16 %conv5.5, 8
%shl.6 = shl nuw i16 %conv5.6, 8
%shl.7 = shl nuw i16 %conv5.7, 8
%shl.8 = shl nuw i16 %conv5.8, 8
%shl.9 = shl nuw i16 %conv5.9, 8
%shl.10 = shl nuw i16 %conv5.10, 8
%shl.11 = shl nuw i16 %conv5.11, 8
%shl.12 = shl nuw i16 %conv5.12, 8
%shl.13 = shl nuw i16 %conv5.13, 8
%shl.14 = shl nuw i16 %conv5.14, 8
%shl.15 = shl nuw i16 %conv5.15, 8
%arrayidx3.1 = getelementptr inbounds i16, i16* %c, i64 1
%arrayidx3.2 = getelementptr inbounds i16, i16* %c, i64 2
%arrayidx3.3 = getelementptr inbounds i16, i16* %c, i64 3
%arrayidx3.4 = getelementptr inbounds i16, i16* %c, i64 4
%arrayidx3.5 = getelementptr inbounds i16, i16* %c, i64 5
%arrayidx3.6 = getelementptr inbounds i16, i16* %c, i64 6
%arrayidx3.7 = getelementptr inbounds i16, i16* %c, i64 7
%arrayidx3.8 = getelementptr inbounds i16, i16* %c, i64 8
%arrayidx3.9 = getelementptr inbounds i16, i16* %c, i64 9
%arrayidx3.10 = getelementptr inbounds i16, i16* %c, i64 10
%arrayidx3.11 = getelementptr inbounds i16, i16* %c, i64 11
%arrayidx3.12 = getelementptr inbounds i16, i16* %c, i64 12
%arrayidx3.13 = getelementptr inbounds i16, i16* %c, i64 13
%arrayidx3.14 = getelementptr inbounds i16, i16* %c, i64 14
%arrayidx3.15 = getelementptr inbounds i16, i16* %c, i64 15
store i16 %shl, i16* %c, align 2
store i16 %shl.1, i16* %arrayidx3.1, align 2
store i16 %shl.2, i16* %arrayidx3.2, align 2
store i16 %shl.3, i16* %arrayidx3.3, align 2
store i16 %shl.4, i16* %arrayidx3.4, align 2
store i16 %shl.5, i16* %arrayidx3.5, align 2
store i16 %shl.6, i16* %arrayidx3.6, align 2
store i16 %shl.7, i16* %arrayidx3.7, align 2
store i16 %shl.8, i16* %arrayidx3.8, align 2
store i16 %shl.9, i16* %arrayidx3.9, align 2
store i16 %shl.10, i16* %arrayidx3.10, align 2
store i16 %shl.11, i16* %arrayidx3.11, align 2
store i16 %shl.12, i16* %arrayidx3.12, align 2
store i16 %shl.13, i16* %arrayidx3.13, align 2
store i16 %shl.14, i16* %arrayidx3.14, align 2
store i16 %shl.15, i16* %arrayidx3.15, align 2
ret void
}
118 changes: 118 additions & 0 deletions llvm/test/Transforms/SLPVectorizer/X86/bytecast.ll
@@ -0,0 +1,118 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SLM
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX

define <2 x i64> @load_bytecast_ext(b8* %p0) {
; SSE2-LABEL: @load_bytecast_ext(
; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds b8, b8* [[P0:%.*]], i64 1
; SSE2-NEXT: [[TMP1:%.*]] = bitcast b8* [[P0]] to <2 x b8>*
; SSE2-NEXT: [[TMP2:%.*]] = load <2 x b8>, <2 x b8>* [[TMP1]], align 1
; SSE2-NEXT: [[TMP3:%.*]] = bytecast <2 x b8> [[TMP2]] to <2 x i8>
; SSE2-NEXT: [[TMP4:%.*]] = zext <2 x i8> [[TMP3]] to <2 x i64>
; SSE2-NEXT: ret <2 x i64> [[TMP4]]
;
; SLM-LABEL: @load_bytecast_ext(
; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds b8, b8* [[P0:%.*]], i64 1
; SLM-NEXT: [[TMP1:%.*]] = bitcast b8* [[P0]] to <2 x b8>*
; SLM-NEXT: [[TMP2:%.*]] = load <2 x b8>, <2 x b8>* [[TMP1]], align 1
; SLM-NEXT: [[TMP3:%.*]] = bytecast <2 x b8> [[TMP2]] to <2 x i8>
; SLM-NEXT: [[TMP4:%.*]] = zext <2 x i8> [[TMP3]] to <2 x i64>
; SLM-NEXT: ret <2 x i64> [[TMP4]]
;
; AVX-LABEL: @load_bytecast_ext(
; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds b8, b8* [[P0:%.*]], i64 1
; AVX-NEXT: [[TMP1:%.*]] = bitcast b8* [[P0]] to <2 x b8>*
; AVX-NEXT: [[TMP2:%.*]] = load <2 x b8>, <2 x b8>* [[TMP1]], align 1
; AVX-NEXT: [[TMP3:%.*]] = bytecast <2 x b8> [[TMP2]] to <2 x i8>
; AVX-NEXT: [[TMP4:%.*]] = zext <2 x i8> [[TMP3]] to <2 x i64>
; AVX-NEXT: ret <2 x i64> [[TMP4]]
;
%p1 = getelementptr inbounds b8, b8* %p0, i64 1
%b0 = load b8, b8* %p0, align 1
%b1 = load b8, b8* %p1, align 1
%i0 = bytecast b8 %b0 to i8
%i1 = bytecast b8 %b1 to i8
%x0 = zext i8 %i0 to i64
%x1 = zext i8 %i1 to i64
%v0 = insertelement <2 x i64> poison, i64 %x0, i32 0
%v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1
ret <2 x i64> %v1
}

define <8 x i8> @load_bytecast(b8* %p0) {
; SSE2-LABEL: @load_bytecast(
; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds b8, b8* [[P0:%.*]], i64 1
; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds b8, b8* [[P0]], i64 2
; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds b8, b8* [[P0]], i64 3
; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds b8, b8* [[P0]], i64 4
; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds b8, b8* [[P0]], i64 5
; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds b8, b8* [[P0]], i64 6
; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds b8, b8* [[P0]], i64 7
; SSE2-NEXT: [[TMP1:%.*]] = bitcast b8* [[P0]] to <8 x b8>*
; SSE2-NEXT: [[TMP2:%.*]] = load <8 x b8>, <8 x b8>* [[TMP1]], align 1
; SSE2-NEXT: [[TMP3:%.*]] = bytecast <8 x b8> [[TMP2]] to <8 x i8>
; SSE2-NEXT: ret <8 x i8> [[TMP3]]
;
; SLM-LABEL: @load_bytecast(
; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds b8, b8* [[P0:%.*]], i64 1
; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds b8, b8* [[P0]], i64 2
; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds b8, b8* [[P0]], i64 3
; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds b8, b8* [[P0]], i64 4
; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds b8, b8* [[P0]], i64 5
; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds b8, b8* [[P0]], i64 6
; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds b8, b8* [[P0]], i64 7
; SLM-NEXT: [[TMP1:%.*]] = bitcast b8* [[P0]] to <8 x b8>*
; SLM-NEXT: [[TMP2:%.*]] = load <8 x b8>, <8 x b8>* [[TMP1]], align 1
; SLM-NEXT: [[TMP3:%.*]] = bytecast <8 x b8> [[TMP2]] to <8 x i8>
; SLM-NEXT: ret <8 x i8> [[TMP3]]
;
; AVX-LABEL: @load_bytecast(
; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds b8, b8* [[P0:%.*]], i64 1
; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds b8, b8* [[P0]], i64 2
; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds b8, b8* [[P0]], i64 3
; AVX-NEXT: [[P4:%.*]] = getelementptr inbounds b8, b8* [[P0]], i64 4
; AVX-NEXT: [[P5:%.*]] = getelementptr inbounds b8, b8* [[P0]], i64 5
; AVX-NEXT: [[P6:%.*]] = getelementptr inbounds b8, b8* [[P0]], i64 6
; AVX-NEXT: [[P7:%.*]] = getelementptr inbounds b8, b8* [[P0]], i64 7
; AVX-NEXT: [[TMP1:%.*]] = bitcast b8* [[P0]] to <8 x b8>*
; AVX-NEXT: [[TMP2:%.*]] = load <8 x b8>, <8 x b8>* [[TMP1]], align 1
; AVX-NEXT: [[TMP3:%.*]] = bytecast <8 x b8> [[TMP2]] to <8 x i8>
; AVX-NEXT: ret <8 x i8> [[TMP3]]
;
%p1 = getelementptr inbounds b8, b8* %p0, i64 1
%p2 = getelementptr inbounds b8, b8* %p0, i64 2
%p3 = getelementptr inbounds b8, b8* %p0, i64 3
%p4 = getelementptr inbounds b8, b8* %p0, i64 4
%p5 = getelementptr inbounds b8, b8* %p0, i64 5
%p6 = getelementptr inbounds b8, b8* %p0, i64 6
%p7 = getelementptr inbounds b8, b8* %p0, i64 7
%b0 = load b8, b8* %p0, align 1
%b1 = load b8, b8* %p1, align 1
%b2 = load b8, b8* %p2, align 1
%b3 = load b8, b8* %p3, align 1
%b4 = load b8, b8* %p4, align 1
%b5 = load b8, b8* %p5, align 1
%b6 = load b8, b8* %p6, align 1
%b7 = load b8, b8* %p7, align 1
%x0 = bytecast b8 %b0 to i8
%x1 = bytecast b8 %b1 to i8
%x2 = bytecast b8 %b2 to i8
%x3 = bytecast b8 %b3 to i8
%x4 = bytecast b8 %b4 to i8
%x5 = bytecast b8 %b5 to i8
%x6 = bytecast b8 %b6 to i8
%x7 = bytecast b8 %b7 to i8
%v0 = insertelement <8 x i8> poison, i8 %x0, i32 0
%v1 = insertelement <8 x i8> %v0, i8 %x1, i32 1
%v2 = insertelement <8 x i8> %v1, i8 %x2, i32 2
%v3 = insertelement <8 x i8> %v2, i8 %x3, i32 3
%v4 = insertelement <8 x i8> %v3, i8 %x4, i32 4
%v5 = insertelement <8 x i8> %v4, i8 %x5, i32 5
%v6 = insertelement <8 x i8> %v5, i8 %x6, i32 6
%v7 = insertelement <8 x i8> %v6, i8 %x7, i32 7
ret <8 x i8> %v7
}
