[compiler-rt][ctx_profile] Add the instrumented contextual profiling APIs

APIs for contextual profiling.

(Tracking Issue: llvm#89287, RFC referenced there)
mtrofin committed Apr 23, 2024
1 parent 4e9decf commit 75161d4
Showing 3 changed files with 483 additions and 0 deletions.
213 changes: 213 additions & 0 deletions compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -13,11 +13,76 @@
#include "sanitizer_common/sanitizer_mutex.h"
#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_safety.h"
#include "sanitizer_common/sanitizer_vector.h"

#include <assert.h>

using namespace __ctx_profile;

namespace {
__sanitizer::SpinMutex AllContextsMutex;
SANITIZER_GUARDED_BY(AllContextsMutex)
__sanitizer::Vector<ContextRoot *> AllContextRoots;

ContextNode *markAsScratch(const ContextNode *Ctx) {
return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
}
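
// Read-and-reset: returns the current value of V and clears the slot. Used to
// take ownership of the thread-local callee/callsite slots exactly once.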

template <typename T> T consume(T &V) {
auto R = V;
V = {0};
return R;
}
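
// Size of the thread-local scratch buffer and of the default arena
// allocation: 2^20 bytes (1 MiB).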

constexpr size_t kPower = 20;
constexpr size_t kBuffSize = 1 << kPower;

size_t getArenaAllocSize(size_t Needed) {
if (Needed >= kBuffSize)
return 2 * Needed;
return kBuffSize;
}
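
// Two-pass sanity check over all of Root's memory blocks: first record the
// start address of every ContextNode carved out of the arenas, then verify
// that every subcontext pointer refers to one of those recorded addresses.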

bool validate(const ContextRoot *Root) {
__sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
const auto *Pos = Mem->start();
while (Pos < Mem->pos()) {
const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
.second)
return false;
Pos += Ctx->size();
}
}

for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
const auto *Pos = Mem->start();
while (Pos < Mem->pos()) {
const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
return false;

Pos += Ctx->size();
}
}
return true;
}
} // namespace
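
// Thread-local scratch space used when a real context can't be produced. The
// returned pointer is tainted via markAsScratch (LSB set to 1) so instrumented
// code can detect the case; writes then land in this buffer rather than in a
// real context.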

__thread char __Buffer[kBuffSize] = {0};

#define TheScratchContext \
markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))
__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
nullptr};
__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};

__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
nullptr;

// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
// the dependency on the latter.
Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
@@ -38,3 +103,151 @@ void Arena::freeArenaList(Arena *&A) {
}
A = nullptr;
}

inline ContextNode *ContextNode::alloc(char *Place, GUID Guid,
uint32_t NrCounters,
uint32_t NrCallsites,
ContextNode *Next) {
return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
}

void ContextNode::reset() {
for (uint32_t I = 0; I < NrCounters; ++I)
counters()[I] = 0;
for (uint32_t I = 0; I < NrCallsites; ++I)
for (auto *Next = subContexts()[I]; Next; Next = Next->Next)
Next->reset();
}
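
// Slow path: allocate a new ContextNode for this GUID and prepend it to the
// list hanging off the callsite's subcontext slot.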

ContextNode *getCallsiteSlow(uint64_t Guid, ContextNode **InsertionPoint,
uint32_t NrCounters, uint32_t NrCallsites) {
auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites);
auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
if (!AllocPlace) {
// The current arena is full: allocate a new one, make it current, and retry
// the bump allocation there.
__llvm_ctx_profile_current_context_root->CurrentMem = Mem =
Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
AllocPlace = Mem->tryBumpAllocate(AllocSize);
}
auto *Ret = ContextNode::alloc(AllocPlace, Guid, NrCounters, NrCallsites,
*InsertionPoint);
*InsertionPoint = Ret;
return Ret;
}
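
// Get (or create) the context for a non-entry-point function. The caller
// advertised the expected callee and the address of this callsite's subcontext
// slot in TLS; if either slot was already consumed, the callee doesn't match,
// or no root is active, fall back to the scratch context.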

ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
uint32_t NrCounters,
uint32_t NrCallsites) {
if (!__llvm_ctx_profile_current_context_root) {
return TheScratchContext;
}
auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
if (!CallsiteContext || isScratch(*CallsiteContext))
return TheScratchContext;

auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
if (ExpectedCallee != Callee)
return TheScratchContext;

auto *Callsite = *CallsiteContext;
while (Callsite && Callsite->guid() != Guid) {
Callsite = Callsite->next();
}
auto *Ret = Callsite ? Callsite
: getCallsiteSlow(Guid, CallsiteContext, NrCounters,
NrCallsites);
if (Ret->callsites_size() != NrCallsites ||
Ret->counters_size() != NrCounters)
__sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
"Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
Ret, Guid, NrCallsites, NrCounters, Ret->guid(),
Ret->callsites_size(), Ret->counters_size());
Ret->onEntry();
return Ret;
}

void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters,
uint32_t NrCallsites) {
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
&AllContextsMutex);
// Re-check under the lock: we got here without holding it, so another
// thread may have already set up Root.
if (Root->FirstMemBlock)
return;
const auto Needed = ContextNode::getAllocSize(NrCounters, NrCallsites);
auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
Root->FirstMemBlock = M;
Root->CurrentMem = M;
Root->FirstNode = ContextNode::alloc(M->tryBumpAllocate(Needed), Guid,
NrCounters, NrCallsites);
AllContextRoots.PushBack(Root);
}

ContextNode *__llvm_ctx_profile_start_context(
ContextRoot *Root, GUID Guid, uint32_t Counters,
uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
if (!Root->FirstMemBlock) {
setupContext(Root, Guid, Counters, Callsites);
}
if (Root->Taken.TryLock()) {
__llvm_ctx_profile_current_context_root = Root;
Root->FirstNode->onEntry();
return Root->FirstNode;
}
__llvm_ctx_profile_current_context_root = nullptr;
return TheScratchContext;
}
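
// Only unlock Root->Taken if this thread actually owns the root: if
// start_context lost the race for the lock, it returned a scratch context and
// set __llvm_ctx_profile_current_context_root to null.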

void __llvm_ctx_profile_release_context(ContextRoot *Root)
SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
if (__llvm_ctx_profile_current_context_root) {
__llvm_ctx_profile_current_context_root = nullptr;
Root->Taken.Unlock();
}
}

void __llvm_ctx_profile_start_collection() {
size_t NrMemUnits = 0;
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
&AllContextsMutex);
for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
auto *Root = AllContextRoots[I];
__sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
&Root->Taken);
for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
++NrMemUnits;

Root->FirstNode->reset();
}
__sanitizer::Printf("[ctxprof] Initial NrMemUnits: %zu \n", NrMemUnits);
}

bool __llvm_ctx_profile_fetch(
void *Data, bool (*Writer)(void *W, const __ctx_profile::ContextNode &)) {
assert(Writer);
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
&AllContextsMutex);

for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
auto *Root = AllContextRoots[I];
__sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
&Root->Taken);
if (!validate(Root)) {
__sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
return false;
}
if (!Writer(Data, *Root->FirstNode))
return false;
}
return true;
}

void __llvm_ctx_profile_free() {
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
&AllContextsMutex);
for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
auto *C = A;
A = A->next();
__sanitizer::InternalFree(C);
}
AllContextRoots.Reset();
}
116 changes: 116 additions & 0 deletions compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
@@ -9,9 +9,11 @@
#ifndef CTX_PROFILE_CTXINSTRPROFILING_H_
#define CTX_PROFILE_CTXINSTRPROFILING_H_

#include "sanitizer_common/sanitizer_mutex.h"
#include <sanitizer/common_interface_defs.h>

namespace __ctx_profile {
using GUID = uint64_t;

/// Arena (bump allocator) forming a linked list. Intentionally not thread safe.
/// Allocation and de-allocation happen using sanitizer APIs. We make that
@@ -51,5 +53,119 @@ class Arena final {
const uint64_t Size;
};
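
/// A node in the call context tree. The fixed part of the object is followed
/// in memory by the counters and then the subcontext pointers:
///   [ContextNode][uint64_t x NrCounters][ContextNode * x NrCallsites]
/// Nodes observed at the same callsite but with different GUIDs are chained
/// through Next.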

class ContextNode final {
const GUID Guid;
ContextNode *const Next;
const uint32_t NrCounters;
const uint32_t NrCallsites;

public:
ContextNode(GUID Guid, uint32_t NrCounters, uint32_t NrCallsites,
ContextNode *Next = nullptr)
: Guid(Guid), Next(Next), NrCounters(NrCounters),
NrCallsites(NrCallsites) {}
static inline ContextNode *alloc(char *Place, GUID Guid, uint32_t NrCounters,
uint32_t NrCallsites,
ContextNode *Next = nullptr);

static inline size_t getAllocSize(uint32_t NrCounters, uint32_t NrCallsites) {
return sizeof(ContextNode) + sizeof(uint64_t) * NrCounters +
sizeof(ContextNode *) * NrCallsites;
}

uint64_t *counters() {
// The counters array starts immediately after the object.
return reinterpret_cast<uint64_t *>(this + 1);
}

uint32_t counters_size() const { return NrCounters; }
uint32_t callsites_size() const { return NrCallsites; }

const uint64_t *counters() const {
return const_cast<ContextNode *>(this)->counters();
}

ContextNode **subContexts() {
return reinterpret_cast<ContextNode **>(&(counters()[NrCounters]));
}

ContextNode *const *subContexts() const {
return const_cast<ContextNode *>(this)->subContexts();
}

GUID guid() const { return Guid; }
ContextNode *next() { return Next; }

size_t size() const { return getAllocSize(NrCounters, NrCallsites); }

void reset();

void onEntry() { ++counters()[0]; }

uint64_t entrycount() const { return counters()[0]; }
};

/// ContextRoots are allocated by LLVM for entrypoints. The main concern is
/// the total size; LLVM doesn't actually dereference members.
struct ContextRoot {
ContextNode *FirstNode = nullptr;
Arena *FirstMemBlock = nullptr;
Arena *CurrentMem = nullptr;
// This is initialized by LLVM's static zero initializer.
::__sanitizer::StaticSpinMutex Taken;

// Avoid surprises due to (unlikely) StaticSpinMutex changes.
static_assert(sizeof(Taken) == 1);
};

/// This API is exposed for testing.
inline bool isScratch(const ContextNode *Ctx) {
return (reinterpret_cast<uint64_t>(Ctx) & 1);
}

} // namespace __ctx_profile

extern "C" {

// LLVM fills these in when lowering an llvm.instrprof.callsite intrinsic.
// Position 0 is used when the current context isn't scratch; position 1 when
// it is.
extern __thread void *volatile __llvm_ctx_profile_expected_callee[2];
extern __thread __ctx_profile::ContextNode *
*volatile __llvm_ctx_profile_callsite[2];

// __llvm_ctx_profile_current_context_root is exposed for unit testing;
// otherwise it's only used internally.
extern __thread __ctx_profile::ContextRoot
*volatile __llvm_ctx_profile_current_context_root;

/// called by LLVM in the entry BB of an "entry point" function. The returned
/// pointer may be "tainted" - its LSB set to 1 - to indicate it's scratch.
__ctx_profile::ContextNode *
__llvm_ctx_profile_start_context(__ctx_profile::ContextRoot *Root,
__ctx_profile::GUID Guid, uint32_t Counters,
uint32_t Callsites);

/// paired with __llvm_ctx_profile_start_context, and called at the exit of the
/// entry point function.
void __llvm_ctx_profile_release_context(__ctx_profile::ContextRoot *Root);

/// called in the entry BB of any function other than an entry point. The same
/// consideration about the LSB of the returned value applies as for
/// .._start_context.
__ctx_profile::ContextNode *
__llvm_ctx_profile_get_context(void *Callee, __ctx_profile::GUID Guid,
uint32_t NrCounters, uint32_t NrCallsites);

/// Prepares for collection. Currently this resets counter values but preserves
/// internal structure.
void __llvm_ctx_profile_start_collection();

/// Completely free allocated memory.
void __llvm_ctx_profile_free();

/// Used to obtain the profile. The Writer is called for each root ContextNode
/// with ContextRoot::Taken held. The Writer is responsible for traversing the
/// structure underneath.
bool __llvm_ctx_profile_fetch(
void *Data, bool (*Writer)(void *, const __ctx_profile::ContextNode &));
}
#endif // CTX_PROFILE_CTXINSTRPROFILING_H_
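
For illustration, a minimal sketch (not part of this commit) of how LLVM-lowered instrumentation might drive these APIs for a hypothetical entry point `entry_point` calling a single callee `leaf`. The GUIDs and counter/callsite shapes are made up, and real lowering also handles the scratch (LSB-tainted) case rather than assuming the happy path:

#include "CtxInstrProfiling.h"
using namespace __ctx_profile;

ContextRoot MyRoot; // static and zero-initialized; one per entry point

void leaf() {
  // Shape: 1 counter (the entry count), 0 callsites. onEntry() has already
  // bumped counter 0 by the time get_context returns.
  __llvm_ctx_profile_get_context(reinterpret_cast<void *>(&leaf),
                                 /*Guid=*/2, /*NrCounters=*/1,
                                 /*NrCallsites=*/0);
}

void entry_point() {
  ContextNode *Ctx = __llvm_ctx_profile_start_context(
      &MyRoot, /*Guid=*/1, /*Counters=*/1, /*Callsites=*/1);
  // Real lowering checks isScratch(Ctx) and uses index 1 of the TLS arrays in
  // that case; this sketch assumes the non-scratch path.
  __llvm_ctx_profile_expected_callee[0] = reinterpret_cast<void *>(&leaf);
  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[0];
  leaf();
  __llvm_ctx_profile_release_context(&MyRoot);
}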