Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FunctionAttrs] deduce attr cold on functions if all CG paths call a cold function #101298

Closed

Conversation

goldsteinn
Copy link
Contributor

  • [FunctionAttrs] Add tests for deducing attr cold on functions; NFC
  • [FunctionAttrs] deduce attr cold on functions if all CG paths call a cold function

@llvmbot
Copy link
Collaborator

llvmbot commented Jul 31, 2024

@llvm/pr-subscribers-llvm-transforms

Author: None (goldsteinn)

Changes
  • [FunctionAttrs] Add tests for deducing attr cold on functions; NFC
  • [FunctionAttrs] deduce attr cold on functions if all CG paths call a cold function

Patch is 33.32 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101298.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/IPO/FunctionAttrs.cpp (+65)
  • (added) llvm/test/Transforms/FunctionAttrs/cold.ll (+880)
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index d50218aaa3b6c..918842a33f08f 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -82,6 +82,7 @@ STATISTIC(NumNoUnwind, "Number of functions marked as nounwind");
 STATISTIC(NumNoFree, "Number of functions marked as nofree");
 STATISTIC(NumWillReturn, "Number of functions marked as willreturn");
 STATISTIC(NumNoSync, "Number of functions marked as nosync");
+STATISTIC(NumCold, "Number of functions marked as cold");
 
 STATISTIC(NumThinLinkNoRecurse,
           "Number of functions marked as norecurse during thinlink");
@@ -1745,6 +1746,7 @@ static bool canReturn(Function &F) {
   return false;
 }
 
+
 // Set the noreturn function attribute if possible.
 static void addNoReturnAttrs(const SCCNodeSet &SCCNodes,
                              SmallSet<Function *, 8> &Changed) {
@@ -1760,6 +1762,65 @@ static void addNoReturnAttrs(const SCCNodeSet &SCCNodes,
   }
 }
 
+static bool
+allBBPathsGoThroughCold(BasicBlock *BB,
+                        SmallDenseMap<BasicBlock *, bool, 16> &Visited) {
+  // If BB contains a cold callsite this path through the CG is cold.
+  if (any_of(*BB, [](Instruction &I) {
+        if (auto *CB = dyn_cast<CallBase>(&I))
+          return CB->hasFnAttr(Attribute::Cold);
+        return false;
+      })) {
+    Visited[BB] = true;
+    return true;
+  }
+
+  auto Succs = successors(BB);
+  // We found a path that doesn't go through any cold callsite.
+  if (Succs.empty())
+    return false;
+
+  for (auto *Succ : Succs) {
+    // Start with false, this is necessary to ensure we don't turn loops into
+    // cold.
+    auto R = Visited.try_emplace(Succ, false);
+    if (!R.second) {
+      if (R.first->second)
+        continue;
+      return false;
+    }
+    if (!allBBPathsGoThroughCold(Succ, Visited))
+      return false;
+    Visited[Succ] = true;
+  }
+
+  return true;
+}
+
+static bool allPathsGoThroughCold(Function &F) {
+  SmallDenseMap<BasicBlock *, bool, 16> Visited;
+  Visited[&F.front()] = false;
+  return allBBPathsGoThroughCold(&F.front(), Visited);
+}
+
+// Set the cold function attribute if possible.
+static void addColdAttrs(const SCCNodeSet &SCCNodes,
+                         SmallSet<Function *, 8> &Changed) {
+  for (Function *F : SCCNodes) {
+    if (!F || !F->hasExactDefinition() || F->hasFnAttribute(Attribute::Naked) ||
+        F->hasFnAttribute(Attribute::Cold) || F->hasFnAttribute(Attribute::Hot))
+      continue;
+
+    // Potential TODO: We could add attribute `cold` on functions with `coldcc`.
+    if (allPathsGoThroughCold(*F)) {
+      F->addFnAttr(Attribute::Cold);
+      ++NumCold;
+      Changed.insert(F);
+      continue;
+    }
+  }
+}
+
 static bool functionWillReturn(const Function &F) {
   // We can infer and propagate function attributes only when we know that the
   // definition we'll get at link time is *exactly* the definition we see now.
@@ -1789,6 +1850,7 @@ static bool functionWillReturn(const Function &F) {
   });
 }
 
+
 // Set the willreturn function attribute if possible.
 static void addWillReturn(const SCCNodeSet &SCCNodes,
                           SmallSet<Function *, 8> &Changed) {
@@ -1802,6 +1864,8 @@ static void addWillReturn(const SCCNodeSet &SCCNodes,
   }
 }
 
+
+
 static SCCNodesResult createSCCNodeSet(ArrayRef<Function *> Functions) {
   SCCNodesResult Res;
   Res.HasUnknownCall = false;
@@ -1853,6 +1917,7 @@ deriveAttrsInPostOrder(ArrayRef<Function *> Functions, AARGetterT &&AARGetter,
   addArgumentAttrs(Nodes.SCCNodes, Changed);
   inferConvergent(Nodes.SCCNodes, Changed);
   addNoReturnAttrs(Nodes.SCCNodes, Changed);
+  addColdAttrs(Nodes.SCCNodes, Changed);
   addWillReturn(Nodes.SCCNodes, Changed);
   addNoUndefAttrs(Nodes.SCCNodes, Changed);
 
diff --git a/llvm/test/Transforms/FunctionAttrs/cold.ll b/llvm/test/Transforms/FunctionAttrs/cold.ll
new file mode 100644
index 0000000000000..a205fbda06212
--- /dev/null
+++ b/llvm/test/Transforms/FunctionAttrs/cold.ll
@@ -0,0 +1,880 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 2
+; RUN: opt -passes=function-attrs -S < %s | FileCheck --check-prefixes=COMMON,FNATTRS %s
+; RUN: opt -passes=attributor-light -S < %s | FileCheck --check-prefixes=COMMON,ATTRIBUTOR %s
+
+declare i32 @get_val()
+
+declare void @cold0() cold
+declare void @cold1() cold
+declare void @cold_at_cb()
+
+declare void @not_cold0()
+declare void @not_cold1()
+declare void @not_cold2()
+
+define void @test_no_exit_fail() {
+; COMMON: Function Attrs: nofree norecurse noreturn nosync nounwind memory(none)
+; COMMON-LABEL: define void @test_no_exit_fail
+; COMMON-SAME: () #[[ATTR1:[0-9]+]] {
+; COMMON-NEXT:  entry:
+; COMMON-NEXT:    br label [[WHILE_BODY:%.*]]
+; COMMON:       while.body:
+; COMMON-NEXT:    br label [[WHILE_BODY]]
+;
+entry:
+  br label %while.body
+
+while.body:
+  br label %while.body
+}
+
+define void @test_no_exit_fail2() {
+; COMMON: Function Attrs: noreturn
+; COMMON-LABEL: define void @test_no_exit_fail2
+; COMMON-SAME: () #[[ATTR2:[0-9]+]] {
+; COMMON-NEXT:  entry:
+; COMMON-NEXT:    br label [[WHILE_BODY:%.*]]
+; COMMON:       while.body:
+; COMMON-NEXT:    call void @not_cold0()
+; COMMON-NEXT:    br label [[WHILE_BODY2:%.*]]
+; COMMON:       while.body2:
+; COMMON-NEXT:    call void @not_cold1()
+; COMMON-NEXT:    br label [[WHILE_BODY]]
+;
+entry:
+  br label %while.body
+
+while.body:
+  call void @not_cold0()
+  br label %while.body2
+
+while.body2:
+  call void @not_cold1()
+  br label %while.body
+}
+
+define void @test_no_exit() {
+; FNATTRS: Function Attrs: cold noreturn
+; FNATTRS-LABEL: define void @test_no_exit
+; FNATTRS-SAME: () #[[ATTR3:[0-9]+]] {
+; FNATTRS-NEXT:  entry:
+; FNATTRS-NEXT:    br label [[WHILE_BODY:%.*]]
+; FNATTRS:       while.body:
+; FNATTRS-NEXT:    call void @cold0()
+; FNATTRS-NEXT:    br label [[WHILE_BODY]]
+;
+; ATTRIBUTOR: Function Attrs: noreturn
+; ATTRIBUTOR-LABEL: define void @test_no_exit
+; ATTRIBUTOR-SAME: () #[[ATTR2]] {
+; ATTRIBUTOR-NEXT:  entry:
+; ATTRIBUTOR-NEXT:    br label [[WHILE_BODY:%.*]]
+; ATTRIBUTOR:       while.body:
+; ATTRIBUTOR-NEXT:    call void @cold0()
+; ATTRIBUTOR-NEXT:    br label [[WHILE_BODY]]
+;
+entry:
+  br label %while.body
+
+while.body:
+  call void @cold0()
+  br label %while.body
+}
+
+define void @test_no_exit2() {
+; FNATTRS: Function Attrs: cold noreturn
+; FNATTRS-LABEL: define void @test_no_exit2
+; FNATTRS-SAME: () #[[ATTR3]] {
+; FNATTRS-NEXT:  entry:
+; FNATTRS-NEXT:    br label [[WHILE_BODY:%.*]]
+; FNATTRS:       while.body:
+; FNATTRS-NEXT:    call void @not_cold0()
+; FNATTRS-NEXT:    br label [[WHILE_BODY2:%.*]]
+; FNATTRS:       while.body2:
+; FNATTRS-NEXT:    call void @cold1()
+; FNATTRS-NEXT:    br label [[WHILE_BODY]]
+;
+; ATTRIBUTOR: Function Attrs: noreturn
+; ATTRIBUTOR-LABEL: define void @test_no_exit2
+; ATTRIBUTOR-SAME: () #[[ATTR2]] {
+; ATTRIBUTOR-NEXT:  entry:
+; ATTRIBUTOR-NEXT:    br label [[WHILE_BODY:%.*]]
+; ATTRIBUTOR:       while.body:
+; ATTRIBUTOR-NEXT:    call void @not_cold0()
+; ATTRIBUTOR-NEXT:    br label [[WHILE_BODY2:%.*]]
+; ATTRIBUTOR:       while.body2:
+; ATTRIBUTOR-NEXT:    call void @cold1()
+; ATTRIBUTOR-NEXT:    br label [[WHILE_BODY]]
+;
+entry:
+  br label %while.body
+
+while.body:
+  call void @not_cold0()
+  br label %while.body2
+
+while.body2:
+  call void @cold1()
+  br label %while.body
+}
+
+define dso_local void @test_entry(i32 noundef %x) {
+; FNATTRS: Function Attrs: cold
+; FNATTRS-LABEL: define dso_local void @test_entry
+; FNATTRS-SAME: (i32 noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; FNATTRS-NEXT:  entry:
+; FNATTRS-NEXT:    tail call void @cold0()
+; FNATTRS-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0
+; FNATTRS-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; FNATTRS:       if.then:
+; FNATTRS-NEXT:    tail call void @not_cold0()
+; FNATTRS-NEXT:    br label [[IF_END]]
+; FNATTRS:       if.end:
+; FNATTRS-NEXT:    tail call void @not_cold1()
+; FNATTRS-NEXT:    ret void
+;
+; ATTRIBUTOR-LABEL: define dso_local void @test_entry
+; ATTRIBUTOR-SAME: (i32 noundef [[X:%.*]]) {
+; ATTRIBUTOR-NEXT:  entry:
+; ATTRIBUTOR-NEXT:    tail call void @cold0()
+; ATTRIBUTOR-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0
+; ATTRIBUTOR-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; ATTRIBUTOR:       if.then:
+; ATTRIBUTOR-NEXT:    tail call void @not_cold0()
+; ATTRIBUTOR-NEXT:    br label [[IF_END]]
+; ATTRIBUTOR:       if.end:
+; ATTRIBUTOR-NEXT:    tail call void @not_cold1()
+; ATTRIBUTOR-NEXT:    ret void
+;
+entry:
+  tail call void @cold0()
+  %tobool.not = icmp eq i32 %x, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:
+  tail call void @not_cold0()
+  br label %if.end
+
+if.end:
+  tail call void @not_cold1()
+  ret void
+}
+
+define dso_local void @test_hot_fail(i32 noundef %x) hot {
+; FNATTRS: Function Attrs: hot
+; FNATTRS-LABEL: define dso_local void @test_hot_fail
+; FNATTRS-SAME: (i32 noundef [[X:%.*]]) #[[ATTR4:[0-9]+]] {
+; FNATTRS-NEXT:  entry:
+; FNATTRS-NEXT:    tail call void @cold0()
+; FNATTRS-NEXT:    ret void
+;
+; ATTRIBUTOR: Function Attrs: hot
+; ATTRIBUTOR-LABEL: define dso_local void @test_hot_fail
+; ATTRIBUTOR-SAME: (i32 noundef [[X:%.*]]) #[[ATTR3:[0-9]+]] {
+; ATTRIBUTOR-NEXT:  entry:
+; ATTRIBUTOR-NEXT:    tail call void @cold0()
+; ATTRIBUTOR-NEXT:    ret void
+;
+entry:
+  tail call void @cold0()
+  ret void
+}
+
+define dso_local void @test_br2(i32 noundef %x) {
+; FNATTRS: Function Attrs: cold
+; FNATTRS-LABEL: define dso_local void @test_br2
+; FNATTRS-SAME: (i32 noundef [[X:%.*]]) #[[ATTR0]] {
+; FNATTRS-NEXT:  entry:
+; FNATTRS-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0
+; FNATTRS-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; FNATTRS:       if.then:
+; FNATTRS-NEXT:    tail call void @cold0()
+; FNATTRS-NEXT:    br label [[IF_END:%.*]]
+; FNATTRS:       if.else:
+; FNATTRS-NEXT:    tail call void @cold1()
+; FNATTRS-NEXT:    br label [[IF_END]]
+; FNATTRS:       if.end:
+; FNATTRS-NEXT:    ret void
+;
+; ATTRIBUTOR-LABEL: define dso_local void @test_br2
+; ATTRIBUTOR-SAME: (i32 noundef [[X:%.*]]) {
+; ATTRIBUTOR-NEXT:  entry:
+; ATTRIBUTOR-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0
+; ATTRIBUTOR-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; ATTRIBUTOR:       if.then:
+; ATTRIBUTOR-NEXT:    tail call void @cold0()
+; ATTRIBUTOR-NEXT:    br label [[IF_END:%.*]]
+; ATTRIBUTOR:       if.else:
+; ATTRIBUTOR-NEXT:    tail call void @cold1()
+; ATTRIBUTOR-NEXT:    br label [[IF_END]]
+; ATTRIBUTOR:       if.end:
+; ATTRIBUTOR-NEXT:    ret void
+;
+entry:
+  %tobool.not = icmp eq i32 %x, 0
+  br i1 %tobool.not, label %if.else, label %if.then
+
+if.then:
+  tail call void @cold0()
+  br label %if.end
+
+if.else:
+  tail call void @cold1()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define dso_local void @test_exit(i32 noundef %x) {
+; FNATTRS: Function Attrs: cold
+; FNATTRS-LABEL: define dso_local void @test_exit
+; FNATTRS-SAME: (i32 noundef [[X:%.*]]) #[[ATTR0]] {
+; FNATTRS-NEXT:  entry:
+; FNATTRS-NEXT:    tail call void @not_cold0()
+; FNATTRS-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0
+; FNATTRS-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; FNATTRS:       if.then:
+; FNATTRS-NEXT:    tail call void @not_cold1()
+; FNATTRS-NEXT:    br label [[IF_END:%.*]]
+; FNATTRS:       if.else:
+; FNATTRS-NEXT:    tail call void @not_cold2()
+; FNATTRS-NEXT:    br label [[IF_END]]
+; FNATTRS:       if.end:
+; FNATTRS-NEXT:    tail call void @cold0()
+; FNATTRS-NEXT:    ret void
+;
+; ATTRIBUTOR-LABEL: define dso_local void @test_exit
+; ATTRIBUTOR-SAME: (i32 noundef [[X:%.*]]) {
+; ATTRIBUTOR-NEXT:  entry:
+; ATTRIBUTOR-NEXT:    tail call void @not_cold0()
+; ATTRIBUTOR-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0
+; ATTRIBUTOR-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; ATTRIBUTOR:       if.then:
+; ATTRIBUTOR-NEXT:    tail call void @not_cold1()
+; ATTRIBUTOR-NEXT:    br label [[IF_END:%.*]]
+; ATTRIBUTOR:       if.else:
+; ATTRIBUTOR-NEXT:    tail call void @not_cold2()
+; ATTRIBUTOR-NEXT:    br label [[IF_END]]
+; ATTRIBUTOR:       if.end:
+; ATTRIBUTOR-NEXT:    tail call void @cold0()
+; ATTRIBUTOR-NEXT:    ret void
+;
+entry:
+  tail call void @not_cold0()
+  %tobool.not = icmp eq i32 %x, 0
+  br i1 %tobool.not, label %if.else, label %if.then
+
+if.then:
+  tail call void @not_cold1()
+  br label %if.end
+
+if.else:
+  tail call void @not_cold2()
+  br label %if.end
+
+if.end:
+  tail call void @cold0()
+  ret void
+}
+
+define dso_local void @test_complex(i32 noundef %x) {
+; FNATTRS: Function Attrs: cold
+; FNATTRS-LABEL: define dso_local void @test_complex
+; FNATTRS-SAME: (i32 noundef [[X:%.*]]) #[[ATTR0]] {
+; FNATTRS-NEXT:  entry:
+; FNATTRS-NEXT:    tail call void @not_cold0()
+; FNATTRS-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0
+; FNATTRS-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE11:%.*]], label [[IF_THEN:%.*]]
+; FNATTRS:       if.then:
+; FNATTRS-NEXT:    [[CALL:%.*]] = tail call i32 @get_val()
+; FNATTRS-NEXT:    [[TOBOOL1_NOT:%.*]] = icmp eq i32 [[CALL]], 0
+; FNATTRS-NEXT:    br i1 [[TOBOOL1_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN2:%.*]]
+; FNATTRS:       if.then2:
+; FNATTRS-NEXT:    tail call void @cold1()
+; FNATTRS-NEXT:    br label [[IF_END12:%.*]]
+; FNATTRS:       if.else:
+; FNATTRS-NEXT:    [[CALL3:%.*]] = tail call i32 @get_val()
+; FNATTRS-NEXT:    [[TOBOOL4_NOT:%.*]] = icmp eq i32 [[CALL3]], 0
+; FNATTRS-NEXT:    br i1 [[TOBOOL4_NOT]], label [[IF_ELSE6:%.*]], label [[IF_THEN5:%.*]]
+; FNATTRS:       if.then5:
+; FNATTRS-NEXT:    tail call void @cold0()
+; FNATTRS-NEXT:    br label [[IF_END12]]
+; FNATTRS:       if.else6:
+; FNATTRS-NEXT:    tail call void @not_cold0()
+; FNATTRS-NEXT:    [[CALL7:%.*]] = tail call i32 @get_val()
+; FNATTRS-NEXT:    switch i32 [[CALL7]], label [[SW_DEFAULT:%.*]] [
+; FNATTRS-NEXT:      i32 0, label [[SW_BB:%.*]]
+; FNATTRS-NEXT:      i32 1, label [[SW_BB8:%.*]]
+; FNATTRS-NEXT:      i32 2, label [[SW_BB9:%.*]]
+; FNATTRS-NEXT:    ]
+; FNATTRS:       sw.bb:
+; FNATTRS-NEXT:    tail call void @not_cold0()
+; FNATTRS-NEXT:    br label [[CALL_COLD:%.*]]
+; FNATTRS:       sw.bb8:
+; FNATTRS-NEXT:    tail call void @not_cold1()
+; FNATTRS-NEXT:    br label [[CALL_COLD]]
+; FNATTRS:       sw.bb9:
+; FNATTRS-NEXT:    tail call void @not_cold2()
+; FNATTRS-NEXT:    br label [[CALL_COLD]]
+; FNATTRS:       sw.default:
+; FNATTRS-NEXT:    tail call void @cold0()
+; FNATTRS-NEXT:    br label [[IF_END12]]
+; FNATTRS:       call_cold:
+; FNATTRS-NEXT:    tail call void @cold_at_cb() #[[ATTR0]]
+; FNATTRS-NEXT:    br label [[IF_END12]]
+; FNATTRS:       if.else11:
+; FNATTRS-NEXT:    tail call void @cold0()
+; FNATTRS-NEXT:    br label [[IF_END12]]
+; FNATTRS:       if.end12:
+; FNATTRS-NEXT:    ret void
+;
+; ATTRIBUTOR-LABEL: define dso_local void @test_complex
+; ATTRIBUTOR-SAME: (i32 noundef [[X:%.*]]) {
+; ATTRIBUTOR-NEXT:  entry:
+; ATTRIBUTOR-NEXT:    tail call void @not_cold0()
+; ATTRIBUTOR-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0
+; ATTRIBUTOR-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE11:%.*]], label [[IF_THEN:%.*]]
+; ATTRIBUTOR:       if.then:
+; ATTRIBUTOR-NEXT:    [[CALL:%.*]] = tail call i32 @get_val()
+; ATTRIBUTOR-NEXT:    [[TOBOOL1_NOT:%.*]] = icmp eq i32 [[CALL]], 0
+; ATTRIBUTOR-NEXT:    br i1 [[TOBOOL1_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN2:%.*]]
+; ATTRIBUTOR:       if.then2:
+; ATTRIBUTOR-NEXT:    tail call void @cold1()
+; ATTRIBUTOR-NEXT:    br label [[IF_END12:%.*]]
+; ATTRIBUTOR:       if.else:
+; ATTRIBUTOR-NEXT:    [[CALL3:%.*]] = tail call i32 @get_val()
+; ATTRIBUTOR-NEXT:    [[TOBOOL4_NOT:%.*]] = icmp eq i32 [[CALL3]], 0
+; ATTRIBUTOR-NEXT:    br i1 [[TOBOOL4_NOT]], label [[IF_ELSE6:%.*]], label [[IF_THEN5:%.*]]
+; ATTRIBUTOR:       if.then5:
+; ATTRIBUTOR-NEXT:    tail call void @cold0()
+; ATTRIBUTOR-NEXT:    br label [[IF_END12]]
+; ATTRIBUTOR:       if.else6:
+; ATTRIBUTOR-NEXT:    tail call void @not_cold0()
+; ATTRIBUTOR-NEXT:    [[CALL7:%.*]] = tail call i32 @get_val()
+; ATTRIBUTOR-NEXT:    switch i32 [[CALL7]], label [[SW_DEFAULT:%.*]] [
+; ATTRIBUTOR-NEXT:      i32 0, label [[SW_BB:%.*]]
+; ATTRIBUTOR-NEXT:      i32 1, label [[SW_BB8:%.*]]
+; ATTRIBUTOR-NEXT:      i32 2, label [[SW_BB9:%.*]]
+; ATTRIBUTOR-NEXT:    ]
+; ATTRIBUTOR:       sw.bb:
+; ATTRIBUTOR-NEXT:    tail call void @not_cold0()
+; ATTRIBUTOR-NEXT:    br label [[CALL_COLD:%.*]]
+; ATTRIBUTOR:       sw.bb8:
+; ATTRIBUTOR-NEXT:    tail call void @not_cold1()
+; ATTRIBUTOR-NEXT:    br label [[CALL_COLD]]
+; ATTRIBUTOR:       sw.bb9:
+; ATTRIBUTOR-NEXT:    tail call void @not_cold2()
+; ATTRIBUTOR-NEXT:    br label [[CALL_COLD]]
+; ATTRIBUTOR:       sw.default:
+; ATTRIBUTOR-NEXT:    tail call void @cold0()
+; ATTRIBUTOR-NEXT:    br label [[IF_END12]]
+; ATTRIBUTOR:       call_cold:
+; ATTRIBUTOR-NEXT:    tail call void @cold_at_cb() #[[ATTR0:[0-9]+]]
+; ATTRIBUTOR-NEXT:    br label [[IF_END12]]
+; ATTRIBUTOR:       if.else11:
+; ATTRIBUTOR-NEXT:    tail call void @cold0()
+; ATTRIBUTOR-NEXT:    br label [[IF_END12]]
+; ATTRIBUTOR:       if.end12:
+; ATTRIBUTOR-NEXT:    ret void
+;
+entry:
+  tail call void @not_cold0()
+  %tobool.not = icmp eq i32 %x, 0
+  br i1 %tobool.not, label %if.else11, label %if.then
+
+if.then:
+  %call = tail call i32 @get_val()
+  %tobool1.not = icmp eq i32 %call, 0
+  br i1 %tobool1.not, label %if.else, label %if.then2
+
+if.then2:
+  tail call void @cold1()
+  br label %if.end12
+
+if.else:
+  %call3 = tail call i32 @get_val()
+  %tobool4.not = icmp eq i32 %call3, 0
+  br i1 %tobool4.not, label %if.else6, label %if.then5
+
+if.then5:
+  tail call void @cold0()
+  br label %if.end12
+
+if.else6:
+  tail call void @not_cold0()
+  %call7 = tail call i32 @get_val()
+  switch i32 %call7, label %sw.default [
+  i32 0, label %sw.bb
+  i32 1, label %sw.bb8
+  i32 2, label %sw.bb9
+  ]
+
+sw.bb:
+  tail call void @not_cold0()
+  br label %call_cold
+
+sw.bb8:
+  tail call void @not_cold1()
+  br label %call_cold
+
+sw.bb9:
+  tail call void @not_cold2()
+  br label %call_cold
+
+sw.default:
+  tail call void @cold0()
+  br label %if.end12
+
+call_cold:
+  tail call void @cold_at_cb() cold
+  br label %if.end12
+
+if.else11:
+  tail call void @cold0()
+  br label %if.end12
+
+if.end12:
+  ret void
+}
+
+define dso_local void @test_complex2(i32 noundef %x) {
+; FNATTRS: Function Attrs: cold
+; FNATTRS-LABEL: define dso_local void @test_complex2
+; FNATTRS-SAME: (i32 noundef [[X:%.*]]) #[[ATTR0]] {
+; FNATTRS-NEXT:  entry:
+; FNATTRS-NEXT:    tail call void @not_cold0()
+; FNATTRS-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[X]], 0
+; FNATTRS-NEXT:    [[CALL12:%.*]] = tail call i32 @get_val()
+; FNATTRS-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_ELSE11:%.*]], label [[IF_THEN:%.*]]
+; FNATTRS:       if.then:
+; FNATTRS-NEXT:    [[TOBOOL1_NOT:%.*]] = icmp eq i32 [[CALL12]], 0
+; FNATTRS-NEXT:    br i1 [[TOBOOL1_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN2:%.*]]
+; FNATTRS:       if.then2:
+; FNATTRS-NEXT:    tail call void @cold1()
+; FNATTRS-NEXT:    br label [[IF_END16:%.*]]
+; FNATTRS:       if.else:
+; FNATTRS-NEXT:    [[CALL3:%.*]] = tail call i32 @get_val()
+; FNATTRS-NEXT:    [[TOBOOL4_NOT:%.*]] = icmp eq i32 [[CALL3]], 0
+; FNATTRS-NEXT:    br i1 [[TOBOOL4_NOT]], label [[IF_ELSE6:%.*]], label [[IF_THEN5:%.*]]
+; FNATTRS:       if.then5:
+; FNATTRS-NEXT:    tail call void @cold0()
+; FNATTRS-NEXT:    br label [[IF_END16]]
+; FNATTRS:       if.else6:
+; FNATTRS-NEXT:    tail call void @not_cold0()
+; FNATTRS-NEXT:    [[CALL7:%.*]] = tail call i32 @get_val()
+; FNATTRS-NEXT:    switch i32 [[CALL7]], label [[SW_DEFAULT:%.*]] [
+; FNATTRS-NEXT:      i32 0, label [[SW_BB:%.*]]
+; FNATTRS-NEXT:      i32 1, label [[SW_BB8:%.*]]
+; FNATTRS-NEXT:      i32 2, label [[SW_BB9:%.*]]
+; FNATTRS-NEXT:    ]
+; FNATTRS:       sw.bb:
+; FNATTRS-NEXT:    tail call void @not_cold0()
+; FNATTRS-NEXT:    br label [[CALL_COLD:%.*]]
+; FNATTRS:       sw.bb8:
+; FNATTRS-NEXT:    tail call void @not_cold1()
+; FNATTRS-NEXT:    br label [[CALL_COLD]]
+; FNATTRS:       sw.bb9:
+; FNATTRS-NEXT:    tail call void @not_cold2(...
[truncated]

@goldsteinn goldsteinn changed the title perf/goldsteinn/deduce cold attrs [FunctionAttrs] deduce attr cold on functions if all CG paths call a cold function Jul 31, 2024
Copy link

github-actions bot commented Jul 31, 2024

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:
git-clang-format --diff 7a80c86726f7c37128bfee3618707c1607f5014d 08d018f6d2b792ccedb462e339ab9fd83758b785 --extensions cpp -- llvm/lib/Transforms/IPO/FunctionAttrs.cpp
View the diff from clang-format here.
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index 603a1565e4..53568ae35d 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -1746,7 +1746,6 @@ static bool canReturn(Function &F) {
   return false;
 }
 
-
 // Set the noreturn function attribute if possible.
 static void addNoReturnAttrs(const SCCNodeSet &SCCNodes,
                              SmallSet<Function *, 8> &Changed) {

}

static bool allPathsGoThroughCold(Function &F) {
SmallDenseMap<BasicBlock *, bool, 16> Visited;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be simpler to turn this into a BFS traversal?

SmallVector<BB> WorkList, Visited;
WorkList.push(entryBB);

while(!WorkList.empty()) {
  BB = WorkList.pop();
  if (BB contains cold calls)
     continue;

  if (BB contains return/calls without willreturn)
     return true;

   Visited.insert(BB);
  
   for (auto *SuccBB: successors(BB))
       if (!Visited.contains(SuccBB))
           WorkList.push_back(SuccBB);
}
return false;

Copy link
Contributor Author

@goldsteinn goldsteinn Jul 31, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thats what I originally had, but was running into issues with inf loops. (edit: not LLVM inf loops, but marking inf loops as cold).

Basically we need more information that just "did we visit this", we need "did we visit this and find a cold call site". I don't really know how to do that without depth-first.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The basic calculus I had was we can either be marking:

entry:
  br label %loop
loop:
  br label %loop

as cold or we can miss the case:

entry:
br label %loop
loop:
%c = <some val>
br i1 %c, label %loop, label %done
done:
<cold call>

and think its better to err on under-attribution rather than over.

Copy link
Contributor Author

@goldsteinn goldsteinn Jul 31, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(we could of course handle this with a dom tree, but I don't think its worth the compile time/complexity). I think most functions we will mark end up essentially being error wrappers which don't have very complex CGs.

@goldsteinn
Copy link
Contributor Author

Okay, this has a notable negative impact on LTO compile times. Will try to fix.

@goldsteinn
Copy link
Contributor Author

goldsteinn commented Jul 31, 2024

Okay, this has a notable negative impact on LTO compile times. Will try to fix.

Looked a bit, it seems like the compile time on LTO is all from LTO using the added cold attrs, not the time spent in FunctionAttrs.

Is this potentially acceptable?

Edit: 90% of the added compile time in ThinLTO seems to be just from the additional cold attrs. I'm going to run SPEC and see if this is yielding fruitful results.

@nikic
Copy link
Contributor

nikic commented Jul 31, 2024

Is this potentially acceptable?

Generally yes, second-order regressions are fine. Of course, it depends on the details. How does the presence of cold attributes affect things -- is this "just" because of substantially different inlining decisions, or something else?

@goldsteinn
Copy link
Contributor Author

goldsteinn commented Jul 31, 2024

Results from running SPEC:
Spec 2x Runs (I dropped specrand as it finishes in < .001 sec and has
+-25% diff which skews the results).

Left hand side (results0*) are with the change, Right hand side (results1*) are without the change.

Program                                       exec_time
                                              results0-0 results1-0 diff
INT2017spe...ed/620.omnetpp_s/620.omnetpp_s    26.82      27.66      3.1%
INT2017rate/525.x264_r/525.x264_r              10.06      10.12      0.6%
INT2017rate/505.mcf_r/505.mcf_r                36.31      36.49      0.5%
INT2017speed/625.x264_s/625.x264_s             10.06      10.11      0.5%
INT2017rate/557.xz_r/557.xz_r                  22.08      22.18      0.4%
INT2017spe...31.deepsjeng_s/631.deepsjeng_s    41.42      41.59      0.4%
FP2017rate/544.nab_r/544.nab_r                 36.35      36.49      0.4%
INT2017spe...00.perlbench_s/600.perlbench_s    10.79      10.83      0.4%
FP2017rate/519.lbm_r/519.lbm_r                 12.17      12.21      0.4%
FP2017speed/644.nab_s/644.nab_s                36.33      36.46      0.4%
FP2017speed/638.imagick_s/638.imagick_s        11.98      12.01      0.2%
FP2017rate/511.povray_r/511.povray_r            1.79       1.80      0.2%
INT2017spe...23.xalancbmk_s/623.xalancbmk_s    17.56      17.60      0.2%
INT2017rate/541.leela_r/541.leela_r            44.06      44.12      0.1%
INT2017rate/520.omnetpp_r/520.omnetpp_r        26.99      27.01      0.1%
                           Geomean difference                        0.2%
       exec_time
run   results0-0 results1-0       diff
count  28.000000  28.000000  28.000000
mean   27.327485  27.359046  0.002049
std    19.987921  19.931984  0.006528
min    1.792980   1.797220  -0.006428
25%    12.126350  12.158975 -0.000629
50%    21.983175  21.959215  0.001016
75%    36.336880  36.467720  0.003874
max    96.006300  95.838400  0.031301

Program                                       exec_time
                                              results0-1 results1-1 diff
INT2017rate/520.omnetpp_r/520.omnetpp_r        26.54      26.92      1.4%
INT2017rate/557.xz_r/557.xz_r                  21.92      22.06      0.7%
INT2017speed/625.x264_s/625.x264_s             10.08      10.14      0.6%
INT2017spe...00.perlbench_s/600.perlbench_s    10.81      10.87      0.5%
INT2017rat...00.perlbench_r/500.perlbench_r    10.80      10.85      0.4%
FP2017rate/544.nab_r/544.nab_r                 36.41      36.56      0.4%
FP2017rate/519.lbm_r/519.lbm_r                 12.16      12.20      0.4%
INT2017rate/525.x264_r/525.x264_r              10.09      10.13      0.3%
FP2017rate/508.namd_r/508.namd_r               12.91      12.95      0.3%
FP2017rate/510.parest_r/510.parest_r           21.89      21.95      0.3%
INT2017spe...31.deepsjeng_s/631.deepsjeng_s    41.47      41.59      0.3%
FP2017rate/526.blender_r/526.blender_r         70.29      70.46      0.2%
INT2017speed/657.xz_s/657.xz_s                 22.00      22.04      0.2%
FP2017speed/619.lbm_s/619.lbm_s                95.80      95.96      0.2%
INT2017speed/605.mcf_s/605.mcf_s               36.18      36.23      0.1%
                           Geomean difference                        0.1%
       exec_time
run   results0-1 results1-1       diff
count  28.000000  28.000000  28.000000
mean   27.309399  27.338671  0.001482
std    19.946388  19.965449  0.004339
min    1.798940   1.800750  -0.010338
25%    12.123775  12.156375 -0.001120
50%    21.901365  21.993835  0.001520
75%    36.258345  36.315370  0.003516
max    95.796100  95.958700  0.014375

Seems so-so, I think probably an improvement on omnetpp, but mostly noise.

@goldsteinn
Copy link
Contributor Author

goldsteinn commented Jul 31, 2024

Is this potentially acceptable?

Generally yes, second-order regressions are fine. Of course, it depends on the details. How does the presence of cold attributes affect things -- is this "just" because of substantially different inlining decisions, or something else?

So looking at the diff in stats from before/after (diff below) seems like some small changes across the board, no number really jumps out. Inlining does have a notable (ish) change with less inlines after the change. Although not sure how to read the numbers to interpret whats leading to the compile time regression.

Edit: My hypothesis is probably on the lines, some of the now cold functions where fully inlined resulting in less functions in later passes. If thats the case I'd argue we probably shouldn't have been inlining those cold functions so I'd say this is a beneficial change.

The + is with the change:

@@ -1,6 +1,6 @@
-         1289338 aa                           - Number of MayAlias results
-          174178 aa                           - Number of MustAlias results
-         5817260 aa                           - Number of NoAlias results
+         1289144 aa                           - Number of MayAlias results
+          174197 aa                           - Number of MustAlias results
+         5816357 aa                           - Number of NoAlias results
            31029 abstract-call-sites          - Number of direct abstract call sites created
               96 abstract-call-sites          - Number of invalid abstract call sites created (no callback)
              392 abstract-call-sites          - Number of invalid abstract call sites created (unknown use)
@@ -11,9 +11,9 @@
              220 argpromotion                 - Number of dead pointer args eliminated
             1275 argpromotion                 - Number of pointer arguments promoted
            33178 assume-queries               - Number of Queries into an assume assume bundles
-         7841528 basicaa                      - Number of times a GEP is decomposed
+         7843168 basicaa                      - Number of times a GEP is decomposed
            13619 basicaa                      - Number of times the limit to decompose GEPs is reached
-            3895 bdce                         - Number of instructions removed (unused)
+            3899 bdce                         - Number of instructions removed (unused)
               22 bdce                         - Number of instructions trivialized (dead bits)
              194 bdce                         - Number of sign extension instructions converted to zero extension
             2091 build-libcalls               - Number of arguments inferred as nocapture
@@ -31,64 +31,64 @@
              614 build-libcalls               - Number of functions inferred as willreturn
              130 build-libcalls               - Number of functions inferred as writeonly
               44 callsite-splitting           - Number of call-site split
-           59752 capture-tracking             - Number of pointers maybe captured
-           20192 capture-tracking             - Number of pointers maybe captured before
-           12565 capture-tracking             - Number of pointers not captured
-            8428 capture-tracking             - Number of pointers not captured before
-             861 constraint-elimination       - Number of instructions removed
+           59767 capture-tracking             - Number of pointers maybe captured
+           20202 capture-tracking             - Number of pointers maybe captured before
+           12570 capture-tracking             - Number of pointers not captured
+            8431 capture-tracking             - Number of pointers not captured before
+             860 constraint-elimination       - Number of instructions removed
               69 correlated-value-propagation - Number of ands removed
-              49 correlated-value-propagation - Number of ashr converted to lshr
+              46 correlated-value-propagation - Number of ashr converted to lshr
                6 correlated-value-propagation - Number of bound udiv's/urem's expanded
-             420 correlated-value-propagation - Number of comparisons propagated
-            8595 correlated-value-propagation - Number of function pointer arguments marked non-null
+             424 correlated-value-propagation - Number of comparisons propagated
+            8591 correlated-value-propagation - Number of function pointer arguments marked non-null
               61 correlated-value-propagation - Number of llvm.[us]{min,max} intrinsics removed
                6 correlated-value-propagation - Number of llvm.abs intrinsics removed
               73 correlated-value-propagation - Number of llvm.s{min,max} intrinsics simplified to unsigned
-             977 correlated-value-propagation - Number of no-signed-wrap deductions
-             718 correlated-value-propagation - Number of no-signed-wrap deductions for add
+             976 correlated-value-propagation - Number of no-signed-wrap deductions
+             717 correlated-value-propagation - Number of no-signed-wrap deductions for add
               23 correlated-value-propagation - Number of no-signed-wrap deductions for mul
              121 correlated-value-propagation - Number of no-signed-wrap deductions for shl
              115 correlated-value-propagation - Number of no-signed-wrap deductions for sub
-            1177 correlated-value-propagation - Number of no-unsigned-wrap deductions
-             805 correlated-value-propagation - Number of no-unsigned-wrap deductions for add
+            1176 correlated-value-propagation - Number of no-unsigned-wrap deductions
+             804 correlated-value-propagation - Number of no-unsigned-wrap deductions for add
               26 correlated-value-propagation - Number of no-unsigned-wrap deductions for mul
              243 correlated-value-propagation - Number of no-unsigned-wrap deductions for shl
              103 correlated-value-propagation - Number of no-unsigned-wrap deductions for sub
-            2149 correlated-value-propagation - Number of no-wrap deductions
-            1523 correlated-value-propagation - Number of no-wrap deductions for add
+            2147 correlated-value-propagation - Number of no-wrap deductions
+            1521 correlated-value-propagation - Number of no-wrap deductions for add
               49 correlated-value-propagation - Number of no-wrap deductions for mul
              364 correlated-value-propagation - Number of no-wrap deductions for shl
              213 correlated-value-propagation - Number of no-wrap deductions for sub
              306 correlated-value-propagation - Number of phis deleted via common incoming value
-            5790 correlated-value-propagation - Number of phis propagated
+            5780 correlated-value-propagation - Number of phis propagated
               43 correlated-value-propagation - Number of sdiv converted to udiv
               41 correlated-value-propagation - Number of sdivs/srems whose width was decreased
              839 correlated-value-propagation - Number of selects propagated
-            1235 correlated-value-propagation - Number of sext converted to zext
+            1234 correlated-value-propagation - Number of sext converted to zext
              432 correlated-value-propagation - Number of signed icmp preds simplified to unsigned
               23 correlated-value-propagation - Number of sitofp converted to uitofp
                1 correlated-value-propagation - Number of srem converted to urem
-              82 correlated-value-propagation - Number of switch cases removed
+              81 correlated-value-propagation - Number of switch cases removed
               42 correlated-value-propagation - Number of udivs/urems whose width was decreased
              364 correlated-value-propagation - Number of zext/uitofp non-negative deductions
-             645 count-visits                 - Max number of times we visited a function
-              27 deadargelim                  - Number of unread args removed
+             646 count-visits                 - Max number of times we visited a function
+              29 deadargelim                  - Number of unread args removed
              135 deadargelim                  - Number of unread args replaced with poison
               36 deadargelim                  - Number of unused return values removed
-          214229 dse                          - Number iterations check for reads in getDomMemoryDef
+          214203 dse                          - Number iterations check for reads in getDomMemoryDef
              464 dse                          - Number of other instrs removed
-             179 dse                          - Number of redundant stores deleted
+             180 dse                          - Number of redundant stores deleted
              201 dse                          - Number of stores dead by later partials
             5173 dse                          - Number of stores deleted
             5673 dse                          - Number of stores modified
-          195126 dse                          - Number of stores remaining after DSE
+          195185 dse                          - Number of stores remaining after DSE
             7139 dse                          - Number of times a valid candidate is returned from getDomMemoryDef
-           84533 early-cse                    - Number of GEP instructions CSE'd
+           84529 early-cse                    - Number of GEP instructions CSE'd
               32 early-cse                    - Number of call instructions CSE'd
              342 early-cse                    - Number of compare instructions CVP'd
-           37737 early-cse                    - Number of instructions CSE'd
+           37735 early-cse                    - Number of instructions CSE'd
           129710 early-cse                    - Number of instructions simplified or DCE'd
-           69942 early-cse                    - Number of load instructions CSE'd
+           69899 early-cse                    - Number of load instructions CSE'd
             2318 early-cse                    - Number of trivial dead stores removed
            56845 file-search                  - Number of #includes skipped due to the multi-include optimization.
           162113 file-search                  - Number of attempted #includes.
@@ -97,9 +97,10 @@
             6621 function-attrs               - Number of arguments marked readonly
              907 function-attrs               - Number of arguments marked returned
             2196 function-attrs               - Number of arguments marked writeonly
-             435 function-attrs               - Number of function returns marked noalias
+             437 function-attrs               - Number of function returns marked noalias
              447 function-attrs               - Number of function returns marked nonnull
              793 function-attrs               - Number of function returns marked noundef
+              84 function-attrs               - Number of functions marked as cold
             3442 function-attrs               - Number of functions marked as nofree
             5949 function-attrs               - Number of functions marked as norecurse
             6060 function-attrs               - Number of functions marked as nosync
@@ -127,87 +128,87 @@
             1772 globalsmodref-aa             - Number of global vars without address taken
               23 globalsmodref-aa             - Number of indirect global objects
             4928 gvn                          - Number of blocks merged
-            7133 gvn                          - Number of blocks speculated as available in IsValueFullyAvailableInBlock(), max
-            2022 gvn                          - Number of equalities propagated
+            7138 gvn                          - Number of blocks speculated as available in IsValueFullyAvailableInBlock(), max
+            2020 gvn                          - Number of equalities propagated
              856 gvn                          - Number of instructions PRE'd
-           62052 gvn                          - Number of instructions deleted
-           22145 gvn                          - Number of instructions simplified
-            7134 gvn                          - Number of loads PRE'd
-           11697 gvn                          - Number of loads deleted
+           62128 gvn                          - Number of instructions deleted
+           22152 gvn                          - Number of instructions simplified
+            7142 gvn                          - Number of loads PRE'd
+           11741 gvn                          - Number of loads deleted
              822 gvn                          - Number of loads moved to predecessor of a critical edge in PRE
              128 gvn                          - Number of loop loads PRE'd
              232 indvars                      - Number of IV comparisons eliminated
               26 indvars                      - Number of IV identities eliminated
                2 indvars                      - Number of IV remainder operations eliminated
-            8702 indvars                      - Number of IV sign/zero extends eliminated
+            8704 indvars                      - Number of IV sign/zero extends eliminated
                1 indvars                      - Number of IV signed remainder operations converted to unsigned remainder
                8 indvars                      - Number of IV users folded into a constant
             6990 indvars                      - Number of congruent IVs eliminated
              570 indvars                      - Number of exit values replaced
             7122 indvars                      - Number of indvars widened
             5410 indvars                      - Number of loop exit tests replaced
-           27730 inline                       - Number of functions deleted because all callers found
-          106114 inline                       - Number of functions inlined
-          132759 inline-cost                  - Number of call sites analyzed
+           27703 inline                       - Number of functions deleted because all callers found
+          105963 inline                       - Number of functions inlined
+          132883 inline-cost                  - Number of call sites analyzed
               27 instcombine                  - Negator: How many negations did we retrieve/reuse from cache
               99 instcombine                  - Negator: Maximal number of new instructions created during negation attempt
              611 instcombine                  - Negator: Maximal number of values ever visited while attempting to sink negation
              177 instcombine                  - Negator: Maximal traversal depth ever reached while attempting to sink negation
-           89540 instcombine                  - Negator: Number of negations attempted to be sinked
+           89555 instcombine                  - Negator: Number of negations attempted to be sinked
              197 instcombine                  - Negator: Number of negations successfully sinked
              225 instcombine                  - Negator: Number of new negated instructions created in successful negation sinking attempts
              262 instcombine                  - Negator: Number of new negated instructions created, total
-           94420 instcombine                  - Negator: Total number of values visited during attempts to sink negation
+           94435 instcombine                  - Negator: Total number of values visited during attempts to sink negation
             2655 instcombine                  - Number of PHI's that got CSE'd
               34 instcombine                  - Number of allocas copied from constant global
              335 instcombine                  - Number of constant folds
-          111540 instcombine                  - Number of dead inst eliminated
+          111544 instcombine                  - Number of dead inst eliminated
              220 instcombine                  - Number of dead stores eliminated
               13 instcombine                  - Number of expansions
              199 instcombine                  - Number of factorizations
-          163239 instcombine                  - Number of functions with one iteration
-           52240 instcombine                  - Number of functions with two iterations
-          215479 instcombine                  - Number of instruction combining iterations performed
+          163257 instcombine                  - Number of functions with one iteration
+           52234 instcombine                  - Number of functions with two iterations
+          215491 instcombine                  - Number of instruction combining iterations performed
            11604 instcombine                  - Number of instructions sunk
-          560025 instcombine                  - Number of insts combined
+          560006 instcombine                  - Number of insts combined
             2322 instcombine                  - Number of library calls simplified
             4530 instcombine                  - Number of phi-of-extractvalue turned into extractvalue-of-phi
                2 instcombine                  - Number of phi-of-insertvalue turned into insertvalue-of-phis
-            1717 instcombine                  - Number of reassociations
+            1719 instcombine                  - Number of reassociations
               24 instcombine                  - Number of select opts
              458 instsimplify                 - Number of expansions
-           17844 instsimplify                 - Number of reassociations
-         2425103 ipt                          - Number of insts scanned while updating ibt
-          102509 ir                           - Number of renumberings across all blocks
+           17846 instsimplify                 - Number of reassociations
+         2424546 ipt                          - Number of insts scanned while updating ibt
+          102517 ir                           - Number of renumberings across all blocks
              140 jump-threading               - Number of branch blocks duplicated to eliminate phi
-            8776 jump-threading               - Number of jumps threaded
-            6600 jump-threading               - Number of terminators folded
-           51881 lcssa                        - Number of live out of a loop variables
+            8779 jump-threading               - Number of jumps threaded
+            6606 jump-threading               - Number of terminators folded
+           51874 lcssa                        - Number of live out of a loop variables
                5 licm                         - Number of add/subtract expressions reassociated and hoisted out of the loop
              304 licm                         - Number of call insts hoisted or sunk
              446 licm                         - Number of geps reassociated and hoisted out of the loop
-           59211 licm                         - Number of instructions hoisted out of loop
+           59221 licm                         - Number of instructions hoisted out of loop
             1300 licm                         - Number of instructions sunk out of loop
              221 licm                         - Number of invariant BinaryOp expressions reassociated and hoisted out of the loop
                2 licm                         - Number of invariant int expressions reassociated and hoisted out of the loop
              564 licm                         - Number of load and store promotions
-           10957 licm                         - Number of load insts hoisted or sunk
+           10966 licm                         - Number of load insts hoisted or sunk
              531 licm                         - Number of load-only promotions
                8 licm                         - Number of min/max expressions hoisted out of the loop
             2658 licm                         - Number of promotion candidates
             1314 local                        - Number of PHI's that got CSE'd
-            5025 local                        - Number of unreachable basic blocks removed
+            4864 local                        - Number of unreachable basic blocks removed
             1768 loop-delete                  - Number of loops deleted
               94 loop-delete                  - Number of loops for which we managed to break the backedge
              156 loop-idiom                   - Number of memcpy's formed from loop load+stores
                1 loop-idiom                   - Number of memmove's formed from loop load+stores
              457 loop-idiom                   - Number of memset's formed from loop stores
                1 loop-idiom                   - Number of uncountable loops recognized as 'shift until zero' idiom
-            1849 loop-instsimplify            - Number of redundant instructions simplified
+            1846 loop-instsimplify            - Number of redundant instructions simplified
               62 loop-peel                    - Number of loops peeled
            28409 loop-rotate                  - Number of instructions cloned into loop preheader
               28 loop-rotate                  - Number of loops not rotated due to the header size
-           10892 loop-rotate                  - Number of loops rotated
+           10891 loop-rotate                  - Number of loops rotated
               75 loop-simplify                - Number of nested loops split out
               62 loop-simplifycfg             - Number of loop blocks deleted
               14 loop-simplifycfg             - Number of loop exiting edges deleted
@@ -225,19 +226,19 @@
               43 memcpyopt                    - Number of memcpys converted to memset
                5 memcpyopt                    - Number of memmoves converted to memcpy
             1601 memcpyopt                    - Number of memsets inferred
-           29829 memdep                       - Number of block queries that were completely cached
+           29835 memdep                       - Number of block queries that were completely cached
              174 memdep                       - Number of cached, but dirty, non-local ptr responses
-         2283554 memdep                       - Number of fully cached non-local ptr responses
+         2282925 memdep                       - Number of fully cached non-local ptr responses
              147 memdep                       - Number of fully cached non-local responses
-         1423784 memdep                       - Number of uncached non-local ptr responses
+         1424029 memdep                       - Number of uncached non-local ptr responses
              188 memdep                       - Number of uncached non-local responses
            28622 memory-builtins              - Number of arguments with unsolved size and offset
-           29952 memory-builtins              - Number of load instructions with unsolved size and offset
+           29962 memory-builtins              - Number of load instructions with unsolved size and offset
               41 reassociate                  - Number of expr tree annihilated
-           28562 reassociate                  - Number of insts reassociated
+           28556 reassociate                  - Number of insts reassociated
                9 reassociate                  - Number of multiplies factored
-           14865 scalar-evolution             - Number of loop exits with predictable exit counts
-           28529 scalar-evolution             - Number of loop exits without predictable exit counts
+           14863 scalar-evolution             - Number of loop exits with predictable exit counts
+           28544 scalar-evolution             - Number of loop exits without predictable exit counts
              283 scalar-evolution             - Number of loops with trip counts computed by force
              299 sccp                         - Number of arguments constant propagated
             3544 sccp                         - Number of basic blocks unreachable
@@ -249,7 +250,7 @@
                3 simple-loop-unswitch         - Number of switches unswitched
               88 simple-loop-unswitch         - Number of unswitch candidates that had their cost multiplier skipped
               39 simple-loop-unswitch         - Number of unswitches that are trivial
-          166289 simplifycfg                  - Number of blocks simplified
+          166262 simplifycfg                  - Number of blocks simplified
             5572 simplifycfg                  - Number of branches folded into predecessor basic block
             4033 simplifycfg                  - Number of common instruction 'blocks' hoisted up to the begin block
             1967 simplifycfg                  - Number of common instruction 'blocks' sunk down to the end block
@@ -265,7 +266,7 @@
            27496 sroa                         - Maximum number of uses of a partition
           767455 sroa                         - Number of alloca partition uses rewritten
           162506 sroa                         - Number of alloca partitions formed
-          190692 sroa                         - Number of allocas analyzed for replacement
+          190686 sroa                         - Number of allocas analyzed for replacement
           160979 sroa                         - Number of allocas promoted to SSA values
           778893 sroa                         - Number of instructions deleted
                6 sroa                         - Number of loads rewritten into predicated loads to allow promotion

@aeubanks
Copy link
Contributor

aeubanks commented Aug 1, 2024

I'm not sure that smaller but more functions necessarily means more compile time, larger functions tend to exasperate superlinear compile times.

Can you post a llvm-compile-time-tracker link (even though it won't necessarily show the second order effects since you tend to need FDO profiles for mark functions as cold)?

@goldsteinn
Copy link
Contributor Author

goldsteinn commented Aug 1, 2024

I'm not sure that smaller but more functions necessarily means more compile time, larger functions tend to exasperate superlinear compile times.

Can you post a llvm-compile-time-tracker link (even though it won't necessarily show the second order effects since you tend to need FDO profiles for mark functions as cold)?

The original impact (of this exact patch):
https://llvm-compile-time-tracker.com/compare.php?from=ffcbfa46b9926bcb42dd7b1f778ada113672595c&to=85efc82366a64fbe5287b7bc9d20853b06d147f6&stat=instructions:u

A test showing the impact as second order:
https://llvm-compile-time-tracker.com/compare.php?from=eb0d0125306e201fbff49c1b00a059c95757f595&to=bb96b08e7c97548bc1242f98b18bf07ba42be1d8&stat=instructions:u

The test basically replaces the setting of cold to setting of nosanitize_bounds in the first commit but results in the same code executing in FunctionAttrs. As you can see in the second commit, when all I do is change the set attr to cold the compile time explodes.

Edit: NB when I ran locally I didn't see any occurrences of nosanitize_bounds. Also verified that the NumCold stat matches w/ the dummy version.

@goldsteinn
Copy link
Contributor Author

ping

llvm/lib/Transforms/IPO/FunctionAttrs.cpp Show resolved Hide resolved
F->hasFnAttribute(Attribute::Cold) || F->hasFnAttribute(Attribute::Hot))
continue;

// Potential TODO: We could add attribute `cold` on functions with `coldcc`.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove this todo. It has been implemented in GlobalOpt:

if (hasChangeableCC(&F, ChangeableCCCache)) {
NumInternalFunc++;
TargetTransformInfo &TTI = GetTTI(F);
// Change the calling convention to coldcc if either stress testing is
// enabled or the target would like to use coldcc on functions which are
// cold at all call sites and the callers contain no other non coldcc
// calls.
if (EnableColdCCStressTest ||
(TTI.useColdCCForColdCall(F) &&
isValidCandidateForColdCC(F, GetBFI, AllCallsCold))) {
ChangeableCCCache.erase(&F);
F.setCallingConv(CallingConv::Cold);
changeCallSitesToColdCC(&F);
Changed = true;
NumColdCC++;
}
}

Copy link
Contributor Author

@goldsteinn goldsteinn Aug 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, this is the other way around where we are add coldcc to functions. The todo is saying add attribute cold on functions that have coldcc.

dtcxzyw added a commit to dtcxzyw/llvm-opt-benchmark that referenced this pull request Aug 11, 2024
Copy link
Member

@dtcxzyw dtcxzyw left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM if compilation time regressions are acceptable.

But I'm still dubious about the usefulness of inferring cold from coldcc.

if (any_of(*BB, [](Instruction &I) {
if (auto *CB = dyn_cast<CallBase>(&I))
return CB->hasFnAttr(Attribute::Cold);
return false;
Copy link
Contributor

@nikic nikic Aug 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd add a note that this assumes throwing and diverging code paths are cold, which is why this does not check for guaranteed-to-transfer.

@goldsteinn goldsteinn force-pushed the perf/goldsteinn/deduce-cold-attrs branch from d261129 to 08d018f Compare August 18, 2024 22:40
Copy link
Contributor

@nikic nikic left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

I looked at some cases where code size increases with this patch, and the effect is basically that we don't inline an error handling path, and that reduces the size of a function below the inlining threshold, so it gets inlined (a lot more) in turn. Overall effect is undesirable, but the decisions are all locally reasonable.

@goldsteinn
Copy link
Contributor Author

goldsteinn commented Aug 20, 2024

LGTM

I looked at some cases where code size increases with this patch, and the effect is basically that we don't inline an error handling path, and that reduces the size of a function below the inlining threshold, so it gets inlined (a lot more) in turn. Overall effect is undesirable, but the decisions are all locally reasonable.

The stats had less total inlining w/ this patch.

-           27730 inline                       - Number of functions deleted because all callers found
-          106114 inline                       - Number of functions inlined
-          132759 inline-cost                  - Number of call sites analyzed
+           27703 inline                       - Number of functions deleted because all callers found
+          105963 inline                       - Number of functions inlined
+          132883 inline-cost                  - Number of call sites analyzed

Before I push this, Ill add a stat for total number of callsites inlined which AFAICT is: IFI.InlinedCallSites.size() as that is probably a useful metric. Edit: I understand incorrectly. Is NumInlined not tracking total number of callsites?

@goldsteinn
Copy link
Contributor Author

LGTM
I looked at some cases where code size increases with this patch, and the effect is basically that we don't inline an error handling path, and that reduces the size of a function below the inlining threshold, so it gets inlined (a lot more) in turn. Overall effect is undesirable, but the decisions are all locally reasonable.

The stats had less total inlining w/ this patch.

-           27730 inline                       - Number of functions deleted because all callers found
-          106114 inline                       - Number of functions inlined
-          132759 inline-cost                  - Number of call sites analyzed
+           27703 inline                       - Number of functions deleted because all callers found
+          105963 inline                       - Number of functions inlined
+          132883 inline-cost                  - Number of call sites analyzed

Before I push this, Ill add a stat for total number of callsites inlined which AFAICT is: IFI.InlinedCallSites.size() as that is probably a useful metric. Edit: I understand incorrectly. Is NumInlined not tracking total number of callsites?

Put a stat in InlineFunction and see the same trend.

@dyung
Copy link
Collaborator

dyung commented Aug 21, 2024

@goldsteinn your change seems to be causing a test failure on a bot https://lab.llvm.org/buildbot/#/builders/174/builds/3661, can you take a look and revert if you need time to investigate?

@goldsteinn
Copy link
Contributor Author

@goldsteinn your change seems to be causing a test failure on a bot https://lab.llvm.org/buildbot/#/builders/174/builds/3661, can you take a look and revert if you need time to investigate?

Looking into it. I think I just need to update the tests. If its more complicated I will revert.

@goldsteinn
Copy link
Contributor Author

Im a bit perplexed by the failure:

/home/noah/programs/opensource/llvm-dev/src/llvm-project/build/./bin/clang  --driver-mode=g++ -Wall  -m64  /home/noah/programs/opensource/llvm-dev/src/llvm-project/compiler-rt/test/metadata/uar.cpp -O1 -o /home/noah/programs/opensource/llvm-dev/src/llvm-project/build/runtimes/runtimes-bins/compiler-rt/test/metadata/Output/uar.cpp.tmp -fexperimental-sanitize-metadata=covered,uar && /home/noah/programs/opensource/llvm-dev/src/llvm-project/build/runtimes/runtimes-bins/compiler-rt/test/metadata/Output/uar.cpp.tmp | ./bin/FileCheck /home/noah/programs/opensource/llvm-dev/src/llvm-project/compiler-rt/test/metadata/uar.cpp -vv --dump-input-filter=all
/home/noah/programs/opensource/llvm-dev/src/llvm-project/compiler-rt/test/metadata/uar.cpp:77:11: error: CHECK: expected string not found in input
// CHECK: with_noreturn_tail_call: features=0
          ^
<stdin>:12:27: note: scanning from here
with_tail_call: features=2 stack_args=0
                          ^
<stdin>:13:15: note: possible intended match here
local_array: features=0 stack_args=0
              ^

Input file: <stdin>
Check file: /home/noah/programs/opensource/llvm-dev/src/llvm-project/compiler-rt/test/metadata/uar.cpp

-dump-input=help explains the following input dump.

Input was:
<<<<<<
            1: metadata add version 2 
check:5        ^~~~~~~~~~~~~~~~~~~~~~
            2: with_noreturn_tail_call: features=0 stack_args=0 
            3: empty: features=0 stack_args=0 
check:17       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            4: simple: features=0 stack_args=0 
check:20       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            5: builtins: features=0 stack_args=0 
check:23       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            6: ellipsis: features=0 stack_args=0 
check:30       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            7: non_empty_function: features=2 stack_args=0 
check:36       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            8: no_stack_args: features=2 stack_args=0 
check:42       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            9: stack_args: features=6 stack_args=16 
check:48       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           10: more_stack_args: features=6 stack_args=32 
check:54       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           11: struct_stack_args: features=6 stack_args=144 
check:61       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           12: with_tail_call: features=2 stack_args=0 
check:72       ^~~~~~~~~~~~~~~~~~~~~~~~~~
check:77'0                               X~~~~~~~~~~~~~ error: no match found
           13: local_array: features=0 stack_args=0 
check:77'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
check:77'1                   ?                       possible intended match
           14: local_alloca: features=0 stack_args=0 
check:77'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           15: escaping_alloca: features=2 stack_args=0 
check:77'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           16: main 
check:77'0     ~~~~~
>>>>>>

Although AFAICT all the outputs match...:

/home/noah/programs/opensource/llvm-dev/src/llvm-project/build/./bin/clang  --driver-mode=g++ -Wall  -m64  /home/noah/programs/opensource/llvm-dev/src/llvm-project/compiler-rt/test/metadata/uar.cpp -O1 -o /home/noah/programs/opensource/llvm-dev/src/llvm-project/build/runtimes/runtimes-bins/compiler-rt/test/metadata/Output/uar.cpp.tmp -fexperimental-sanitize-metadata=covered,uar && /home/noah/programs/opensource/llvm-dev/src/llvm-project/build/runtimes/runtimes-bins/compiler-rt/test/metadata/Output/uar.cpp.tmp
metadata add version 2
with_noreturn_tail_call: features=0 stack_args=0
empty: features=0 stack_args=0
simple: features=0 stack_args=0
builtins: features=0 stack_args=0
ellipsis: features=0 stack_args=0
non_empty_function: features=2 stack_args=0
no_stack_args: features=2 stack_args=0
stack_args: features=6 stack_args=16
more_stack_args: features=6 stack_args=32
struct_stack_args: features=6 stack_args=144
with_tail_call: features=2 stack_args=0
local_array: features=0 stack_args=0
local_alloca: features=0 stack_args=0
escaping_alloca: features=2 stack_args=0
main

Ill revert to unblock the pipeline while i figure this out.

@goldsteinn goldsteinn reopened this Aug 21, 2024
@dyung
Copy link
Collaborator

dyung commented Aug 21, 2024

Although AFAICT all the outputs match...:

/home/noah/programs/opensource/llvm-dev/src/llvm-project/build/./bin/clang  --driver-mode=g++ -Wall  -m64  /home/noah/programs/opensource/llvm-dev/src/llvm-project/compiler-rt/test/metadata/uar.cpp -O1 -o /home/noah/programs/opensource/llvm-dev/src/llvm-project/build/runtimes/runtimes-bins/compiler-rt/test/metadata/Output/uar.cpp.tmp -fexperimental-sanitize-metadata=covered,uar && /home/noah/programs/opensource/llvm-dev/src/llvm-project/build/runtimes/runtimes-bins/compiler-rt/test/metadata/Output/uar.cpp.tmp
metadata add version 2
with_noreturn_tail_call: features=0 stack_args=0
empty: features=0 stack_args=0
simple: features=0 stack_args=0
builtins: features=0 stack_args=0
ellipsis: features=0 stack_args=0
non_empty_function: features=2 stack_args=0
no_stack_args: features=2 stack_args=0
stack_args: features=6 stack_args=16
more_stack_args: features=6 stack_args=32
struct_stack_args: features=6 stack_args=144
with_tail_call: features=2 stack_args=0
local_array: features=0 stack_args=0
local_alloca: features=0 stack_args=0
escaping_alloca: features=2 stack_args=0
main

Ill revert to unblock the pipeline while i figure this out.

It appears one CHECK line no longer appears in the output:

// CHECK: with_noreturn_tail_call: features=0

@goldsteinn
Copy link
Contributor Author

Although AFAICT all the outputs match...:

/home/noah/programs/opensource/llvm-dev/src/llvm-project/build/./bin/clang  --driver-mode=g++ -Wall  -m64  /home/noah/programs/opensource/llvm-dev/src/llvm-project/compiler-rt/test/metadata/uar.cpp -O1 -o /home/noah/programs/opensource/llvm-dev/src/llvm-project/build/runtimes/runtimes-bins/compiler-rt/test/metadata/Output/uar.cpp.tmp -fexperimental-sanitize-metadata=covered,uar && /home/noah/programs/opensource/llvm-dev/src/llvm-project/build/runtimes/runtimes-bins/compiler-rt/test/metadata/Output/uar.cpp.tmp
metadata add version 2
with_noreturn_tail_call: features=0 stack_args=0
empty: features=0 stack_args=0
simple: features=0 stack_args=0
builtins: features=0 stack_args=0
ellipsis: features=0 stack_args=0
non_empty_function: features=2 stack_args=0
no_stack_args: features=2 stack_args=0
stack_args: features=6 stack_args=16
more_stack_args: features=6 stack_args=32
struct_stack_args: features=6 stack_args=144
with_tail_call: features=2 stack_args=0
local_array: features=0 stack_args=0
local_alloca: features=0 stack_args=0
escaping_alloca: features=2 stack_args=0
main

Ill revert to unblock the pipeline while i figure this out.

It appears one CHECK line no longer appears in the output:

// CHECK: with_noreturn_tail_call: features=0

It's second line of the output though...

@goldsteinn
Copy link
Contributor Author

Posted: #105559 with the recommit.

cjdb pushed a commit to cjdb/llvm-project that referenced this pull request Aug 23, 2024
dmpolukhin pushed a commit to dmpolukhin/llvm-project that referenced this pull request Sep 2, 2024
@nikic
Copy link
Contributor

nikic commented Sep 5, 2024

Closing this PR as the recommit landed in #105559.

@nikic nikic closed this Sep 5, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

6 participants