Skip to content

Commit

Permalink
[MemCpyOpt] Calculate the offset value to forward memcpy
Browse files Browse the repository at this point in the history
  • Loading branch information
DianQK committed Jun 18, 2024
1 parent a538809 commit a29c0b9
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 42 deletions.
85 changes: 61 additions & 24 deletions llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
Expand Down Expand Up @@ -1124,28 +1125,67 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
MemCpyInst *MDep,
BatchAAResults &BAA) {
// We can only transform memcpy's where the dest of one is the source of the
// other.
if (M->getSource() != MDep->getDest() || MDep->isVolatile())
return false;

// If dep instruction is reading from our current input, then it is a noop
// transfer and substituting the input won't change this instruction. Just
// ignore the input and let someone else zap MDep. This handles cases like:
// transfer and substituting the input won't change this instruction. Just
// ignore the input and let someone else zap MDep. This handles cases like:
// memcpy(a <- a)
// memcpy(b <- a)
if (M->getSource() == MDep->getSource())
return false;

// Second, the length of the memcpy's must be the same, or the preceding one
// We can only optimize non-volatile memcpy's.
if (MDep->isVolatile())
return false;

int64_t MForwardOffset = 0;
const DataLayout &DL = M->getModule()->getDataLayout();
// We can only transform memcpy's where the dest of one is the source of the
// other, or where the following one's source is at a non-negative offset from
// the preceding one's dest.
if (M->getSource() != MDep->getDest()) {
std::optional<int64_t> Offset =
M->getSource()->getPointerOffsetFrom(MDep->getDest(), DL);
if (!Offset || *Offset < 0)
return false;
MForwardOffset = *Offset;
}

// The length of the memcpy's must be the same, or the preceding one
// must be larger than the following one.
if (MDep->getLength() != M->getLength()) {
if (MForwardOffset != 0 || (MDep->getLength() != M->getLength())) {
auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
auto *MLen = dyn_cast<ConstantInt>(M->getLength());
if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
if (!MDepLen || !MLen ||
MDepLen->getZExtValue() < MLen->getZExtValue() + MForwardOffset)
return false;
}

IRBuilder<> Builder(M);
auto *CopySource = MDep->getRawSource();
auto CleanupOnFailure = llvm::make_scope_exit([&CopySource] {
if (CopySource->use_empty())
cast<Instruction>(CopySource)->eraseFromParent();
});
MaybeAlign CopySourceAlign = MDep->getSourceAlign();
// We just need to calculate the actual size of the copy.
auto MCopyLoc = MemoryLocation::getForSource(MDep).getWithNewSize(
MemoryLocation::getForSource(M).Size);

// We need to update `MCopyLoc` if an offset exists.
if (MForwardOffset > 0) {
// The copy destination of `M` may be able to serve as the copy source.
std::optional<int64_t> MDestOffset =
M->getRawDest()->getPointerOffsetFrom(MDep->getRawSource(), DL);
if (MDestOffset && *MDestOffset == MForwardOffset)
CopySource = M->getRawDest();
else
CopySource = Builder.CreateInBoundsPtrAdd(
CopySource, ConstantInt::get(Type::getInt64Ty(Builder.getContext()),
MForwardOffset));
MCopyLoc = MCopyLoc.getWithNewPtr(CopySource);
if (CopySourceAlign)
CopySourceAlign = commonAlignment(*CopySourceAlign, MForwardOffset);
}

// Verify that the copied-from memory doesn't change in between the two
// transfers. For example, in:
// memcpy(a <- b)
Expand All @@ -1155,10 +1195,8 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
//
// TODO: If the code between M and MDep is transparent to the destination "c",
// then we could still perform the xform by moving M up to the first memcpy.
// TODO: It would be sufficient to check the MDep source up to the memcpy
// size of M, rather than MDep.
if (writtenBetween(MSSA, BAA, MemoryLocation::getForSource(MDep),
MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(M)))
if (writtenBetween(MSSA, BAA, MCopyLoc, MSSA->getMemoryAccess(MDep),
MSSA->getMemoryAccess(M)))
return false;

// If the dest of the second might alias the source of the first, then the
Expand All @@ -1183,23 +1221,22 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,

// TODO: Is this worth it if we're creating a less aligned memcpy? For
// example we could be moving from movaps -> movq on x86.
IRBuilder<> Builder(M);
Instruction *NewM;
if (UseMemMove)
NewM = Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(),
MDep->getRawSource(), MDep->getSourceAlign(),
M->getLength(), M->isVolatile());
NewM =
Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(), CopySource,
CopySourceAlign, M->getLength(), M->isVolatile());
else if (isa<MemCpyInlineInst>(M)) {
// llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is
// never allowed since that would allow the latter to be lowered as a call
// to an external function.
NewM = Builder.CreateMemCpyInline(
M->getRawDest(), M->getDestAlign(), MDep->getRawSource(),
MDep->getSourceAlign(), M->getLength(), M->isVolatile());
NewM = Builder.CreateMemCpyInline(M->getRawDest(), M->getDestAlign(),
CopySource, CopySourceAlign,
M->getLength(), M->isVolatile());
} else
NewM = Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(),
MDep->getRawSource(), MDep->getSourceAlign(),
M->getLength(), M->isVolatile());
NewM =
Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(), CopySource,
CopySourceAlign, M->getLength(), M->isVolatile());
NewM->copyMetadata(*M, LLVMContext::MD_DIAssignID);

assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M)));
Expand Down
17 changes: 10 additions & 7 deletions llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ define void @forward_offset(ptr %dep_src) {
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
; CHECK-NEXT: [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 6, i1 false)
; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[DEP]], i64 6, i1 false)
; CHECK-NEXT: ret void
;
%dep_dest = alloca %buf, align 1
Expand All @@ -30,7 +30,7 @@ define void @forward_offset_align(ptr %dep_src) {
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 4 [[DEP_SRC]], i64 9, i1 false)
; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 3
; CHECK-NEXT: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 3
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[SRC]], i64 5, i1 false)
; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[DEST]], i64 5, i1 false)
; CHECK-NEXT: ret void
;
%dep_dest = alloca %buf, align 1
Expand All @@ -49,7 +49,7 @@ define void @forward_offset_align_2(ptr %dep_src) {
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 4 [[DEP_SRC]], i64 9, i1 false)
; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 2
; CHECK-NEXT: [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 2
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 6, i1 false)
; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP]], ptr align 2 [[DEP]], i64 6, i1 false)
; CHECK-NEXT: ret void
;
%dep_dest = alloca %buf, align 1
Expand All @@ -68,7 +68,8 @@ define void @forward_offset_with_gep(ptr %dep_src) {
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
; CHECK-NEXT: [[DEP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 2
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP1]], ptr align 1 [[SRC]], i64 6, i1 false)
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP1]], ptr align 1 [[TMP1]], i64 6, i1 false)
; CHECK-NEXT: ret void
;
%dep_dest = alloca %buf, align 1
Expand All @@ -87,7 +88,8 @@ define void @forward_offset_memcpy(ptr %dep_src) {
; CHECK-NEXT: [[DEST:%.*]] = alloca [9 x i8], align 1
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[SRC]], i64 6, i1 false)
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 6, i1 false)
; CHECK-NEXT: call void @use(ptr [[DEST]])
; CHECK-NEXT: ret void
;
Expand All @@ -108,7 +110,8 @@ define void @forward_offset_memcpy_inline(ptr %dep_src) {
; CHECK-NEXT: [[DEST:%.*]] = alloca [9 x i8], align 1
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false)
; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
; CHECK-NEXT: call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[SRC]], i64 6, i1 false)
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
; CHECK-NEXT: call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 6, i1 false)
; CHECK-NEXT: call void @use(ptr [[DEST]])
; CHECK-NEXT: ret void
;
Expand Down Expand Up @@ -151,7 +154,7 @@ define void @forward_offset_and_store(ptr %dep_src) {
; CHECK-NEXT: store i8 1, ptr [[DEP_SRC_END]], align 1
; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
; CHECK-NEXT: [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 5, i1 false)
; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[DEP]], i64 5, i1 false)
; CHECK-NEXT: ret void
;
%dep_dest = alloca %buf, align 1
Expand Down
12 changes: 1 addition & 11 deletions llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,16 @@
define void @forward_offset_and_store(ptr %dep_src) {
; CUSTOM-LABEL: define void @forward_offset_and_store(
; CUSTOM-SAME: ptr [[DEP_SRC:%.*]]) {
; CUSTOM-NEXT: [[DEP_DEST:%.*]] = alloca [7 x i8], align 1
; CUSTOM-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(7) [[DEP_DEST]], ptr noundef nonnull align 1 dereferenceable(7) [[DEP_SRC]], i64 7, i1 false)
; CUSTOM-NEXT: store i8 1, ptr [[DEP_SRC]], align 1
; CUSTOM-NEXT: [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 6
; CUSTOM-NEXT: store i8 1, ptr [[DEP_SRC_END]], align 1
; CUSTOM-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
; CUSTOM-NEXT: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
; CUSTOM-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(5) [[DEST]], ptr noundef nonnull align 1 dereferenceable(5) [[SRC]], i64 5, i1 false)
; CUSTOM-NEXT: ret void
;
; O2-LABEL: define void @forward_offset_and_store(
; O2-SAME: ptr nocapture [[DEP_SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; O2-NEXT: [[DEP_DEST:%.*]] = alloca [7 x i8], align 1
; O2-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(7) [[DEP_DEST]], ptr noundef nonnull align 1 dereferenceable(7) [[DEP_SRC]], i64 7, i1 false)
; O2-SAME: ptr nocapture writeonly [[DEP_SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; O2-NEXT: store i8 1, ptr [[DEP_SRC]], align 1
; O2-NEXT: [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 6
; O2-NEXT: store i8 1, ptr [[DEP_SRC_END]], align 1
; O2-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
; O2-NEXT: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1
; O2-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(5) [[DEST]], ptr noundef nonnull align 1 dereferenceable(5) [[SRC]], i64 5, i1 false)
; O2-NEXT: ret void
;
%dep_dest = alloca %buf, align 1
Expand Down

0 comments on commit a29c0b9

Please sign in to comment.