diff --git a/src/OpenTelemetry.AutoInstrumentation.Native/CMakeLists.txt b/src/OpenTelemetry.AutoInstrumentation.Native/CMakeLists.txt index 535dcc10b3..28593fc970 100644 --- a/src/OpenTelemetry.AutoInstrumentation.Native/CMakeLists.txt +++ b/src/OpenTelemetry.AutoInstrumentation.Native/CMakeLists.txt @@ -168,6 +168,8 @@ endif() add_library("OpenTelemetry.AutoInstrumentation.Native.static" STATIC class_factory.cpp clr_helpers.cpp + continuous_profiler_clr_helpers.cpp + continuous_profiler.cpp cor_profiler_base.cpp cor_profiler.cpp il_rewriter_wrapper.cpp diff --git a/src/OpenTelemetry.AutoInstrumentation.Native/OpenTelemetry.AutoInstrumentation.Native.def b/src/OpenTelemetry.AutoInstrumentation.Native/OpenTelemetry.AutoInstrumentation.Native.def index 09ba4f7371..8b60f1af11 100644 --- a/src/OpenTelemetry.AutoInstrumentation.Native/OpenTelemetry.AutoInstrumentation.Native.def +++ b/src/OpenTelemetry.AutoInstrumentation.Native/OpenTelemetry.AutoInstrumentation.Native.def @@ -7,3 +7,7 @@ EXPORTS IsProfilerAttached AddInstrumentations AddDerivedInstrumentations + ConfigureContinuousProfiler + ContinuousProfilerReadThreadSamples + ContinuousProfilerReadAllocationSamples + ContinuousProfilerSetNativeContext diff --git a/src/OpenTelemetry.AutoInstrumentation.Native/OpenTelemetry.AutoInstrumentation.Native.vcxproj b/src/OpenTelemetry.AutoInstrumentation.Native/OpenTelemetry.AutoInstrumentation.Native.vcxproj index d5d1c3f994..6953fc9394 100644 --- a/src/OpenTelemetry.AutoInstrumentation.Native/OpenTelemetry.AutoInstrumentation.Native.vcxproj +++ b/src/OpenTelemetry.AutoInstrumentation.Native/OpenTelemetry.AutoInstrumentation.Native.vcxproj @@ -186,6 +186,8 @@ + + @@ -220,6 +222,8 @@ + + diff --git a/src/OpenTelemetry.AutoInstrumentation.Native/clr_helpers.cpp b/src/OpenTelemetry.AutoInstrumentation.Native/clr_helpers.cpp index 77379e8c54..3d9d10ab28 100644 --- a/src/OpenTelemetry.AutoInstrumentation.Native/clr_helpers.cpp +++ 
b/src/OpenTelemetry.AutoInstrumentation.Native/clr_helpers.cpp @@ -487,6 +487,13 @@ WSTRING GetSigTypeTokName(PCCOR_SIGNATURE& pbCur, const ComPtr ref_flag = true; } + bool pointer_flag = false; + if (*pbCur == ELEMENT_TYPE_PTR) + { + pbCur++; + pointer_flag = true; + } + switch (*pbCur) { case ELEMENT_TYPE_BOOLEAN: @@ -610,6 +617,12 @@ WSTRING GetSigTypeTokName(PCCOR_SIGNATURE& pbCur, const ComPtr { tokenName += WStr("&"); } + + if (pointer_flag) + { + tokenName += WStr("*"); + } + return tokenName; } @@ -856,6 +869,9 @@ bool ParseParamOrLocal(PCCOR_SIGNATURE& pbCur, PCCOR_SIGNATURE pbEnd) if (*pbCur == ELEMENT_TYPE_BYREF) pbCur++; + if (*pbCur == ELEMENT_TYPE_PTR) + pbCur++; + return ParseType(pbCur, pbEnd); } diff --git a/src/OpenTelemetry.AutoInstrumentation.Native/clr_helpers.h b/src/OpenTelemetry.AutoInstrumentation.Native/clr_helpers.h index af9ec64aa9..82fb47043a 100644 --- a/src/OpenTelemetry.AutoInstrumentation.Native/clr_helpers.h +++ b/src/OpenTelemetry.AutoInstrumentation.Native/clr_helpers.h @@ -629,6 +629,28 @@ HRESULT GetCorLibAssemblyRef(const ComPtr& assembly_emit, bool FindTypeDefByName(const trace::WSTRING instrumentationTargetMethodTypeName, const trace::WSTRING assemblyName, const ComPtr& metadata_import, mdTypeDef& typeDef); + +// FunctionMethodSignature +bool ParseByte(PCCOR_SIGNATURE& pbCur, PCCOR_SIGNATURE pbEnd, unsigned char* pbOut); +bool ParseNumber(PCCOR_SIGNATURE& pbCur, PCCOR_SIGNATURE pbEnd, unsigned* pOut); +bool ParseTypeDefOrRefEncoded(PCCOR_SIGNATURE& pbCur, PCCOR_SIGNATURE pbEnd, unsigned char* pIndexTypeOut, + unsigned* pIndexOut); + +/* we don't support + PTR CustomMod* VOID + PTR CustomMod* Type + FNPTR MethodDefSig + FNPTR MethodRefSig + ARRAY Type ArrayShape + SZARRAY CustomMod+ Type (but we do support SZARRAY Type) + */ +bool ParseType(PCCOR_SIGNATURE& pbCur, PCCOR_SIGNATURE pbEnd); +// Param ::= CustomMod* ( TYPEDBYREF | [BYREF] Type ) +// CustomMod* TYPEDBYREF we don't support +bool ParseParamOrLocal(PCCOR_SIGNATURE& 
pbCur, PCCOR_SIGNATURE pbEnd); +// RetType ::= CustomMod* ( VOID | TYPEDBYREF | [BYREF] Type ) +// CustomMod* TYPEDBYREF we don't support +bool ParseRetType(PCCOR_SIGNATURE& pbCur, PCCOR_SIGNATURE pbEnd); } // namespace trace #endif // OTEL_CLR_PROFILER_CLR_HELPERS_H_ diff --git a/src/OpenTelemetry.AutoInstrumentation.Native/continuous_profiler.cpp b/src/OpenTelemetry.AutoInstrumentation.Native/continuous_profiler.cpp new file mode 100644 index 0000000000..ad4e30b071 --- /dev/null +++ b/src/OpenTelemetry.AutoInstrumentation.Native/continuous_profiler.cpp @@ -0,0 +1,1076 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +// We want to use std::min, not the windows.h macro +#define NOMINMAX +#include "continuous_profiler.h" +#include "logger.h" +#include +#include +#include +#include +#ifndef _WIN32 +#include +#include +#endif + +constexpr auto kMaxStringLength = 512UL; + +constexpr auto kMaxCodesPerBuffer = 10 * 1000; + +// If you change this, consider ThreadSampler.cs too +constexpr auto kSamplesBufferMaximumSize = 200 * 1024; + +constexpr auto kSamplesBufferDefaultSize = 20 * 1024; + +// If you change these, change ThreadSampler.cs too +constexpr auto kDefaultSamplePeriod = 10000; +constexpr auto kMinimumSamplePeriod = 1000; + +constexpr auto kDefaultMaxAllocsPerMinute = 200; + +// FIXME make configurable (hidden)? +// These numbers were chosen to keep total overhead under 1 MB of RAM in typical cases (name lengths being the biggest +// variable) +constexpr auto kMaxFunctionNameCacheSize = 5000; +constexpr auto kMaxVolatileFunctionNameCacheSize = 2000; + +// If you squint you can make out that the original bones of this came from sample code provided by the dotnet project: +// https://github.com/dotnet/samples/blob/2cf486af936261b04a438ea44779cdc26c613f98/core/profiling/stacksampling/src/sampler.cpp +// That stack sampling project is worth reading for a simpler (though higher overhead) take on thread sampling. 
+ +/* + Locking/threading design: + We have the following shared data structures: + - A buffer for captured thread samples, used by a single writing thread and a single (managed) reading one + - A buffer for captured allocation samples, used by any application thread at any time + - a name cache (data structure for building human-readable stack traces), used during both thread and + allocation sampling + - a cache of thread id->thread name (set by each thread itself, used during either sample type) + - a cache of thread span context state (set by any application thread at any time, used during either sample type) + + In general we want to keep locks "adjacent" to just one data structure and usage of them local to one + modifying/reading method to simplify analysis. However, there are some special cases. + Here are the locks in use: + - cpu_buffer_lock guarding access to the thread samples buffer + - allocation_buffer_lock guarding access to the buffer for allocation samples + - name_cache_lock, guarding the data structures used for function/class name lookup + - thread_state_lock_ guarding the thread name map + - thread_span_context_lock guarding that data structure + - (special) a profiling_lock so only one type of profiling (thread stacks or allocation sample) runs at a time + + The special cases worth calling out about locking behavior are: + - Because the stack sampler pauses the CLR, to avoid deadlock it needs to know that no application thread + is holding a lock it needs (e.g., thread_span_context_lock) while it is paused. So, it acquires + all the necessary locks at once before pausing the CLR. + - Because the thread sampler pauses the whole CLR and then proceeds to walk every + thread's stack (using the name cache for each stack entry), it obviously needs the + name_cache_lock but we don't want to pay the overhead of locking/unlocking it in that tight loop. 
So, it is acquired once before the iteration of threads starts, and unlike other locks, + the methods for this are specifically coded assuming you own it. + - AllocationTick(s - possibly on multiple threads) and the thread sampler can happen concurrently, + so the profiling_lock is used with unique_lock and shared_lock to ensure that any allocation samples + are fully processed before pausing the CLR for thread samples. +*/ + +static std::mutex cpu_buffer_lock = std::mutex(); +static std::vector* cpu_buffer_a; +static std::vector* cpu_buffer_b; + +static std::mutex allocation_buffer_lock = std::mutex(); +static std::vector* allocation_buffer = new std::vector(); + +static std::mutex thread_span_context_lock; +static std::unordered_map thread_span_context_map; + +static std::mutex name_cache_lock = std::mutex(); + +static std::shared_mutex profiling_lock = std::shared_mutex(); + +static ICorProfilerInfo12* profiler_info; // After feature sets settle down, perhaps this should be refactored and have + // a single static instance of ThreadSampler + +// Dirt-simple back pressure system to save overhead if managed code is not reading fast enough +bool ThreadSamplingShouldProduceThreadSample() +{ + std::lock_guard guard(cpu_buffer_lock); + return cpu_buffer_a == nullptr || cpu_buffer_b == nullptr; +} +void ThreadSamplingRecordProducedThreadSample(std::vector* buf) +{ + std::lock_guard guard(cpu_buffer_lock); + if (cpu_buffer_a == nullptr) + { + cpu_buffer_a = buf; + } + else if (cpu_buffer_b == nullptr) + { + cpu_buffer_b = buf; + } + else + { + trace::Logger::Warn("Unexpected buffer drop in ThreadSampling_RecordProducedThreadSample"); + delete buf; // needs to be dropped now + } +} +// Can return 0 if none are pending +int32_t ThreadSamplingConsumeOneThreadSample(int32_t len, unsigned char* buf) +{ + if (len <= 0 || buf == nullptr) + { + trace::Logger::Warn("Unexpected 0/null buffer to ThreadSampling_ConsumeOneThreadSample"); + return 0; + } + std::vector* to_use = nullptr; 
+ { + std::lock_guard guard(cpu_buffer_lock); + if (cpu_buffer_a != nullptr) + { + to_use = cpu_buffer_a; + cpu_buffer_a = nullptr; + } + else if (cpu_buffer_b != nullptr) + { + to_use = cpu_buffer_b; + cpu_buffer_b = nullptr; + } + } + if (to_use == nullptr) + { + return 0; + } + const size_t to_use_len = static_cast(std::min(to_use->size(), static_cast(len))); + memcpy(buf, to_use->data(), to_use_len); + delete to_use; + return static_cast(to_use_len); +} + +void AllocationSamplingAppendToBuffer(int32_t appendLen, unsigned char* appendBuf) +{ + if (appendLen <= 0 || appendBuf == NULL) + { + return; + } + std::lock_guard guard(allocation_buffer_lock); + + if (allocation_buffer->size() + appendLen >= kSamplesBufferMaximumSize) + { + trace::Logger::Warn("Discarding captured allocation sample. Allocation buffer is full."); + return; + } + allocation_buffer->insert(allocation_buffer->end(), appendBuf, &appendBuf[appendLen]); +} + +// Can return 0 +int32_t AllocationSamplingConsumeAndReplaceBuffer(int32_t len, unsigned char* buf) +{ + if (len <= 0 || buf == nullptr) + { + trace::Logger::Warn("Unexpected 0/null buffer to ContinuousProfilerReadAllocationSamples"); + return 0; + } + std::vector* to_use = nullptr; + { + std::lock_guard guard(allocation_buffer_lock); + to_use = allocation_buffer; + allocation_buffer = new std::vector(); + allocation_buffer->reserve(kSamplesBufferDefaultSize); + } + if (to_use == nullptr) + { + return 0; + } + const size_t to_use_len = static_cast(std::min(to_use->size(), static_cast(len))); + memcpy(buf, to_use->data(), to_use_len); + delete to_use; + return static_cast(to_use_len); +} + +namespace continuous_profiler +{ + +/* +* The thread samples buffer format is optimized for single-pass and efficient writing by the native sampling thread +* (which +* has paused the CLR) +* +* It uses a simple byte-opcode format with fairly standard binary encoding of values. 
It is entirely positional but is +* at least versioned +* so that mismatched components (native writer and managed reader) will not emit nonsense. +* +* ints, shorts, and 64-bit longs are written in big-endian format; strings are written as 2-byte-length-prefixed +* standard windows utf-16 strings +* +* I would write out the "spec" for this format here, but it essentially maps to the code +* (e.g., 0x01 is StartBatch, which is followed by an int versionNumber and a long captureStartTimeInMillis) +* +* The bulk of the data is an (unknown length) array of frame strings, which are represented as coded strings in each +* buffer. +* Each used string is given a code (starting at 1) - using an old old inline trick, codes are introduced by writing the +* code as a +* negative number followed by the definition of the string (length-prefixed) that maps to that code. Later uses of the +* code +* simply use the 2-byte (positive) code, meaning frequently used strings will take only 2 bytes apiece. 0 is reserved +* for "end of list" +* since the number of frames is not known up-front. +* +* Each buffer can be parsed/decoded independently; the codes and the LRU NameCache are not related. 
+*/ + +// defined op codes +constexpr auto kThreadSamplesStartBatch = 0x01; +constexpr auto kThreadSamplesStartSample = 0x02; +constexpr auto kThreadSamplesEndBatch = 0x06; +constexpr auto kThreadSamplesFinalStats = 0x07; +constexpr auto kAllocationSample = 0x08; + +constexpr auto kCurrentThreadSamplesBufferVersion = 1; + +continuous_profiler::ThreadSamplesBuffer::ThreadSamplesBuffer(std::vector* buf) : buffer_(buf) +{ +} +ThreadSamplesBuffer::~ThreadSamplesBuffer() +{ + buffer_ = nullptr; // specifically don't delete as this is done by RecordProduced/ConsumeOneThreadSample +} + +#define CHECK_SAMPLES_BUFFER_LENGTH() \ + { \ + if (buffer_->size() >= kSamplesBufferMaximumSize) \ + { \ + return; \ + } \ + } + +void ThreadSamplesBuffer::StartBatch() const +{ + CHECK_SAMPLES_BUFFER_LENGTH() + WriteByte(kThreadSamplesStartBatch); + WriteInt(kCurrentThreadSamplesBufferVersion); + WriteCurrentTimeMillis(); +} + +void ThreadSamplesBuffer::StartSample(ThreadID id, + const ThreadState* state, + const thread_span_context& span_context) const +{ + CHECK_SAMPLES_BUFFER_LENGTH() + WriteByte(kThreadSamplesStartSample); + WriteInt(span_context.managed_thread_id_); + WriteString(state->thread_name_); + WriteUInt64(span_context.trace_id_high_); + WriteUInt64(span_context.trace_id_low_); + WriteUInt64(span_context.span_id_); + // Feature possibilities: (managed/native) thread priority, cpu/wait times, etc. 
+} + +void ThreadSamplesBuffer::AllocationSample(uint64_t allocSize, + const WCHAR* allocType, + size_t allocTypeCharLen, + ThreadID id, + const ThreadState* state, + const thread_span_context& span_context) const + +{ + CHECK_SAMPLES_BUFFER_LENGTH() + WriteByte(kAllocationSample); + WriteCurrentTimeMillis(); + WriteUInt64(allocSize); + WriteString(allocType, allocTypeCharLen); + WriteInt(span_context.managed_thread_id_); + WriteString(state->thread_name_); + WriteUInt64(span_context.trace_id_high_); + WriteUInt64(span_context.trace_id_low_); + WriteUInt64(span_context.span_id_); +} + +void ThreadSamplesBuffer::RecordFrame(FunctionID fid, const trace::WSTRING& frame) +{ + CHECK_SAMPLES_BUFFER_LENGTH() + WriteCodedFrameString(fid, frame); +} +void ThreadSamplesBuffer::EndSample() const +{ + CHECK_SAMPLES_BUFFER_LENGTH() + WriteShort(0); +} +void ThreadSamplesBuffer::EndBatch() const +{ + CHECK_SAMPLES_BUFFER_LENGTH() + WriteByte(kThreadSamplesEndBatch); +} +void ThreadSamplesBuffer::WriteFinalStats(const SamplingStatistics& stats) const +{ + CHECK_SAMPLES_BUFFER_LENGTH() + WriteByte(kThreadSamplesFinalStats); + WriteInt(stats.micros_suspended); + WriteInt(stats.num_threads); + WriteInt(stats.total_frames); + WriteInt(stats.name_cache_misses); +} + +void ThreadSamplesBuffer::WriteCodedFrameString(FunctionID fid, const trace::WSTRING& str) +{ + const auto found = codes_.find(fid); + if (found != codes_.end()) + { + WriteShort(static_cast(found->second)); + } + else + { + const int code = static_cast(codes_.size()) + 1; + if (codes_.size() + 1 < kMaxCodesPerBuffer) + { + codes_[fid] = code; + } + WriteShort(static_cast(-code)); // note negative sign indicating definition of code + WriteString(str); + } +} +void ThreadSamplesBuffer::WriteShort(int16_t val) const +{ + buffer_->push_back(((val >> 8) & 0xFF)); + buffer_->push_back(val & 0xFF); +} +void ThreadSamplesBuffer::WriteInt(int32_t val) const +{ + buffer_->push_back(((val >> 24) & 0xFF)); + buffer_->push_back(((val 
>> 16) & 0xFF)); + buffer_->push_back(((val >> 8) & 0xFF)); + buffer_->push_back(val & 0xFF); +} + +void ThreadSamplesBuffer::WriteString(const WCHAR* s, size_t charLen) const +{ + // limit strings to a max length overall; this prevents (e.g.) thread names or + // any other miscellaneous strings that come along from blowing things out + const short used_len = static_cast(std::min(charLen, static_cast(kMaxStringLength))); + WriteShort(used_len); + // odd bit of casting since we're copying bytes, not wchars + const auto str_begin = reinterpret_cast(s); + // possible endian-ness assumption here; unclear how the managed layer would decode on big endian platforms + buffer_->insert(buffer_->end(), str_begin, str_begin + used_len * 2); +} + +void ThreadSamplesBuffer::WriteString(const trace::WSTRING& str) const +{ + WriteString(str.c_str(), str.length()); +} +void ThreadSamplesBuffer::WriteByte(unsigned char b) const +{ + buffer_->push_back(b); +} +void ThreadSamplesBuffer::WriteUInt64(uint64_t val) const +{ + buffer_->push_back(((val >> 56) & 0xFF)); + buffer_->push_back(((val >> 48) & 0xFF)); + buffer_->push_back(((val >> 40) & 0xFF)); + buffer_->push_back(((val >> 32) & 0xFF)); + buffer_->push_back(((val >> 24) & 0xFF)); + buffer_->push_back(((val >> 16) & 0xFF)); + buffer_->push_back(((val >> 8) & 0xFF)); + buffer_->push_back(val & 0xFF); +} + +void ThreadSamplesBuffer::WriteCurrentTimeMillis() const +{ + const auto ms = + std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()); + WriteUInt64(ms.count()); +} + +NamingHelper::NamingHelper() + : function_name_cache_(kMaxFunctionNameCacheSize, nullptr) + , volatile_function_name_cache_(kMaxVolatileFunctionNameCacheSize, + std::pair(nullptr, {})) +{ +} + +bool ContinuousProfiler::AllocateBuffer() +{ + const bool should = ThreadSamplingShouldProduceThreadSample(); + if (!should) + { + return should; + } + stats_ = SamplingStatistics(); + auto bytes = new std::vector(); + 
bytes->reserve(kSamplesBufferDefaultSize); + cur_cpu_writer_ = new ThreadSamplesBuffer(bytes); + return should; +} + +void ContinuousProfiler::PublishBuffer() +{ + ThreadSamplingRecordProducedThreadSample(cur_cpu_writer_->buffer_); + delete cur_cpu_writer_; + cur_cpu_writer_ = nullptr; + stats_ = SamplingStatistics(); +} + +[[nodiscard]] FunctionIdentifier NamingHelper::GetFunctionIdentifier(const FunctionID func_id, + const COR_PRF_FRAME_INFO frame_info) const +{ + if (func_id == 0) + { + constexpr auto zero_valid_function_identifier = FunctionIdentifier{0, 0, true}; + return zero_valid_function_identifier; + } + + ModuleID module_id = 0; + mdToken function_token = 0; + // theoretically there is a possibility to use GetFunctionInfo method, but it does not support generic methods + const HRESULT hr = + info12_->GetFunctionInfo2(func_id, frame_info, nullptr, &module_id, &function_token, 0, nullptr, nullptr); + if (FAILED(hr)) + { + trace::Logger::Debug("GetFunctionInfo2 failed. HRESULT=0x", std::setfill('0'), std::setw(8), std::hex, hr); + constexpr auto zero_invalid_function_identifier = FunctionIdentifier{0, 0, false}; + return zero_invalid_function_identifier; + } + + return FunctionIdentifier{function_token, module_id, true}; +} + +void NamingHelper::GetFunctionName(FunctionIdentifier function_identifier, trace::WSTRING& result) +{ + constexpr auto unknown_list_of_arguments = WStr("(unknown)"); + constexpr auto unknown_function_name = WStr("Unknown(unknown)"); + + if (!function_identifier.is_valid) + { + result.append(unknown_function_name); + return; + } + + if (function_identifier.function_token == 0) + { + constexpr auto unknown_native_function_name = WStr("Unknown_Native_Function(unknown)"); + result.append(unknown_native_function_name); + return; + } + + ComPtr metadata_import; + HRESULT hr = info12_->GetModuleMetaData(function_identifier.module_id, ofRead, IID_IMetaDataImport2, + reinterpret_cast(&metadata_import)); + if (FAILED(hr)) + { + 
trace::Logger::Debug("GetModuleMetaData failed. HRESULT=0x", std::setfill('0'), std::setw(8), std::hex, hr); + result.append(unknown_function_name); + return; + } + + const auto function_info = GetFunctionInfo(metadata_import, function_identifier.function_token); + + if (!function_info.IsValid()) + { + trace::Logger::Debug("GetFunctionInfo failed. HRESULT=0x", std::setfill('0'), std::setw(8), std::hex, hr); + result.append(unknown_function_name); + return; + } + + result.append(function_info.type.name); + result.append(name_separator); + result.append(function_info.name); + + HCORENUM function_gen_params_enum = nullptr; + HCORENUM class_gen_params_enum = nullptr; + mdGenericParam function_generic_params[kGenericParamsMaxLen]{}; + mdGenericParam class_generic_params[kGenericParamsMaxLen]{}; + ULONG function_gen_params_count = 0; + ULONG class_gen_params_count = 0; + + mdTypeDef class_token = function_info.type.id; + + hr = metadata_import->EnumGenericParams(&class_gen_params_enum, class_token, class_generic_params, + kGenericParamsMaxLen, &class_gen_params_count); + metadata_import->CloseEnum(class_gen_params_enum); + if (FAILED(hr)) + { + trace::Logger::Debug("Class generic parameters enumeration failed. HRESULT=0x", std::setfill('0'), std::setw(8), + std::hex, hr); + result.append(unknown_list_of_arguments); + return; + } + + hr = metadata_import->EnumGenericParams(&function_gen_params_enum, function_identifier.function_token, + function_generic_params, kGenericParamsMaxLen, &function_gen_params_count); + metadata_import->CloseEnum(function_gen_params_enum); + if (FAILED(hr)) + { + trace::Logger::Debug("Method generic parameters enumeration failed. 
HRESULT=0x", std::setfill('0'), + std::setw(8), std::hex, hr); + result.append(unknown_list_of_arguments); + return; + } + + if (function_gen_params_count > 0) + { + result.append(kGenericParamsOpeningBrace); + for (ULONG i = 0; i < function_gen_params_count; ++i) + { + if (i != 0) + { + result.append(kParamsSeparator); + } + + WCHAR param_type_name[kParamNameMaxLen]{}; + ULONG pch_name = 0; + hr = metadata_import->GetGenericParamProps(function_generic_params[i], nullptr, nullptr, nullptr, nullptr, + param_type_name, kParamNameMaxLen, &pch_name); + if (FAILED(hr)) + { + trace::Logger::Debug("GetGenericParamProps failed. HRESULT=0x", std::setfill('0'), std::setw(8), + std::hex, hr); + result.append(kUnknown); + } + else + { + result.append(param_type_name); + } + } + result.append(kGenericParamsClosingBrace); + } + + // try to list arguments type + FunctionMethodSignature function_method_signature = function_info.method_signature; + hr = function_method_signature.TryParse(); + if (FAILED(hr)) + { + result.append(unknown_list_of_arguments); + trace::Logger::Debug("FunctionMethodSignature parsing failed. 
HRESULT=0x", std::setfill('0'), std::setw(8), + std::hex, hr); + } + else + { + const auto& arguments = function_method_signature.GetMethodArguments(); + result.append(kFunctionParamsOpeningBrace); + for (ULONG i = 0; i < arguments.size(); i++) + { + if (i != 0) + { + result.append(kParamsSeparator); + } + + result.append(arguments[i].GetTypeTokName(metadata_import, class_generic_params, function_generic_params)); + } + result.append(kFunctionParamsClosingBrace); + } +} + +trace::WSTRING* NamingHelper::Lookup(FunctionID fid, COR_PRF_FRAME_INFO frame, SamplingStatistics& stats) +{ + // This method is using two layers of caching + // 1st layer depends on FunctionID which is volatile (and valid only within one thread suspension) + // 2nd layer depends on mdToken for function (which is stable) and ModuleId which could be volatile, + // but the pair should be stable enough to avoid any overlaps. + + const std::pair volatile_answer = volatile_function_name_cache_.Get(fid); + if (volatile_answer.first != nullptr) + { + function_name_cache_.Refresh(volatile_answer.second); + return volatile_answer.first; + } + + const auto function_identifier = this->GetFunctionIdentifier(fid, frame); + + trace::WSTRING* answer = function_name_cache_.Get(function_identifier); + if (answer != nullptr) + { + volatile_function_name_cache_.Put(fid, std::pair(answer, function_identifier)); + return answer; + } + stats.name_cache_misses++; + answer = new trace::WSTRING(); + this->GetFunctionName(function_identifier, *answer); + + const auto old_value = function_name_cache_.Put(function_identifier, answer); + delete old_value; + + volatile_function_name_cache_.Put(fid, std::pair(answer, function_identifier)); + return answer; +} + +// This is slightly messy since we can only pass one parameter to the FrameCallback +// but we have some slightly different use cases (but want to use the same stack capture +// code for allocations and paused thread samples) +struct DoStackSnapshotParams +{ + 
ContinuousProfiler* prof; + ThreadSamplesBuffer* buffer; + DoStackSnapshotParams(ContinuousProfiler* p, ThreadSamplesBuffer* b) : prof(p), buffer(b) + { + } +}; + +HRESULT __stdcall FrameCallback(_In_ FunctionID func_id, + _In_ UINT_PTR ip, + _In_ COR_PRF_FRAME_INFO frame_info, + _In_ ULONG32 context_size, + _In_ BYTE context[], + _In_ void* client_data) +{ + const auto params = static_cast(client_data); + params->prof->stats_.total_frames++; + const trace::WSTRING* name = params->prof->helper.Lookup(func_id, frame_info, params->prof->stats_); + // This is where line numbers could be calculated + params->buffer->RecordFrame(func_id, *name); + return S_OK; +} + +// Factored out from the loop to a separate function for easier auditing and control of the thread state lock +void CaptureSamples(ContinuousProfiler* prof, ICorProfilerInfo12* info12) +{ + ICorProfilerThreadEnum* thread_enum = nullptr; + HRESULT hr = info12->EnumThreads(&thread_enum); + if (FAILED(hr)) + { + trace::Logger::Debug("Could not EnumThreads. 
HRESULT=0x", std::setfill('0'), std::setw(8), std::hex, hr); + return; + } + ThreadID thread_id; + ULONG num_returned = 0; + + prof->helper.volatile_function_name_cache_.Clear(); + prof->cur_cpu_writer_->StartBatch(); + DoStackSnapshotParams dssp = DoStackSnapshotParams(prof, prof->cur_cpu_writer_); + while ((hr = thread_enum->Next(1, &thread_id, &num_returned)) == S_OK) + { + prof->stats_.num_threads++; + thread_span_context spanContext = thread_span_context_map[thread_id]; + auto found = prof->managed_tid_to_state_.find(thread_id); + if (found != prof->managed_tid_to_state_.end() && found->second != nullptr) + { + prof->cur_cpu_writer_->StartSample(thread_id, found->second, spanContext); + } + else + { + auto unknown = ThreadState(); + prof->cur_cpu_writer_->StartSample(thread_id, &unknown, spanContext); + } + + // Don't reuse the hr being used for the thread enum, especially since a failed snapshot isn't fatal + HRESULT snapshotHr = + info12->DoStackSnapshot(thread_id, &FrameCallback, COR_PRF_SNAPSHOT_DEFAULT, &dssp, nullptr, 0); + if (FAILED(snapshotHr)) + { + trace::Logger::Debug("DoStackSnapshot failed. HRESULT=0x", std::setfill('0'), std::setw(8), std::hex, + snapshotHr); + } + prof->cur_cpu_writer_->EndSample(); + } + prof->cur_cpu_writer_->EndBatch(); +} + +void PauseClrAndCaptureSamples(ContinuousProfiler* prof, ICorProfilerInfo12* info12) +{ + // before trying to suspend the runtime, acquire exclusive lock + // it's not safe to try to suspend the runtime after other locks are acquired + // if there is application thread in the middle of AllocationTick + std::unique_lock unique_lock(profiling_lock); + + // These locks are in use by managed threads; Acquire locks before suspending the runtime to prevent deadlock + // Any of these can be in use by random app/clr threads, but this is the only + // place that acquires more than one lock at a time. 
+ std::lock_guard thread_state_guard(prof->thread_state_lock_); + std::lock_guard span_context_guard(thread_span_context_lock); + std::lock_guard name_cache_guard(name_cache_lock); + + const auto start = std::chrono::steady_clock::now(); + + HRESULT hr = info12->SuspendRuntime(); + if (FAILED(hr)) + { + trace::Logger::Warn("Could not suspend runtime to sample threads. HRESULT=0x", std::setfill('0'), std::setw(8), + std::hex, hr); + } + else + { + try + { + CaptureSamples(prof, info12); + } + catch (const std::exception& e) + { + trace::Logger::Warn("Could not capture thread samples: ", e.what()); + } + catch (...) + { + trace::Logger::Warn("Could not capture thread sample for unknown reasons"); + } + } + // I don't have any proof but I sure hope that if suspending fails then it's still ok to ask to resume, with no + // ill effects + hr = info12->ResumeRuntime(); + if (FAILED(hr)) + { + trace::Logger::Error("Could not resume runtime? HRESULT=0x", std::setfill('0'), std::setw(8), std::hex, hr); + } + + const auto end = std::chrono::steady_clock::now(); + const auto elapsed_micros = std::chrono::duration_cast(end - start).count(); + prof->stats_.micros_suspended = static_cast(elapsed_micros); + prof->cur_cpu_writer_->WriteFinalStats(prof->stats_); + trace::Logger::Debug("Threads sampled in ", elapsed_micros, " micros. 
threads=", prof->stats_.num_threads, + " frames=", prof->stats_.total_frames, " misses=", prof->stats_.name_cache_misses); + + prof->PublishBuffer(); +} + +void SleepMillis(unsigned int millis) +{ +#ifdef _WIN32 + Sleep(millis); +#else + usleep(millis * 1000); // micros +#endif +} + +DWORD WINAPI SamplingThreadMain(_In_ LPVOID param) +{ + const auto prof = static_cast(param); + ICorProfilerInfo12* info12 = prof->info12; + + info12->InitializeCurrentThread(); + + while (true) + { + SleepMillis(prof->threadSamplingInterval); + const bool shouldSample = prof->AllocateBuffer(); + if (!shouldSample) + { + trace::Logger::Warn( + "Skipping a thread sample period, buffers are full. ** THIS WILL RESULT IN LOSS OF PROFILING DATA **"); + } + else + { + PauseClrAndCaptureSamples(prof, info12); + } + } +} + +void ContinuousProfiler::SetGlobalInfo12(ICorProfilerInfo12* cor_profiler_info12) +{ + profiler_info = cor_profiler_info12; + this->info12 = cor_profiler_info12; + this->helper.info12_ = cor_profiler_info12; +} + +void ContinuousProfiler::StartThreadSampling(const unsigned int threadSamplingInterval) +{ + trace::Logger::Info("ContinuousProfiler::StartThreadSampling"); + this->threadSamplingInterval = threadSamplingInterval; +#ifdef _WIN32 + CreateThread(nullptr, 0, &SamplingThreadMain, this, 0, nullptr); +#else + pthread_t thr; + pthread_create(&thr, NULL, (void* (*)(void*)) & SamplingThreadMain, this); +#endif +} + +thread_span_context GetCurrentSpanContext(ThreadID tid) +{ + std::lock_guard guard(thread_span_context_lock); + return thread_span_context_map[tid]; +} + +ThreadState* ContinuousProfiler::GetCurrentThreadState(ThreadID tid) +{ + std::lock_guard guard(thread_state_lock_); + return managed_tid_to_state_[tid]; +} + +// You can read about the ETW event format for AllocationTick at +// https://docs.microsoft.com/en-us/dotnet/framework/performance/garbage-collection-etw-events#gcallocationtick_v3-event +// or, if that is not working, a search for "GCAllocationTick 
ETW" will get you there. +// As of this comment, the above link only documents v3 of the event, with v4 undocumented but +// by source traversal differs only by the addition of the actual size of the just-allocated object +// Do not be fooled by "AllocationAmount" which is set to the 100kb sampling limit. + +// https://github.com/dotnet/runtime/blob/cdb6e1d5f9075214c8a58ca75d5314b5dc64daed/src/coreclr/vm/ClrEtwAll.man#L1157 + +// AllocationAmount int32 +// AllocationKind int32 +// InstanceId int16 +// AllocationAmount64 int64 +// TypeId pointer +// TypeName ucs2 string, null terminated, variable length +// HeapIndex int32 +// Address pointer +// AllocatedSize int64 + +constexpr auto EtwPointerSize = sizeof(void*); +constexpr auto AllocationTickV4TypeNameStartByteIndex = 4 + 4 + 2 + 8 + EtwPointerSize; +constexpr auto AllocationTickV4SizeWithoutTypeName = 4 + 4 + 2 + 8 + EtwPointerSize + 4 + EtwPointerSize + 8; + +void CaptureAllocationStack(ContinuousProfiler* prof, ThreadSamplesBuffer* buffer) +{ + std::lock_guard guard(name_cache_lock); + // Read explanation of volatile clearing in NamingHelper::Lookup + prof->helper.volatile_function_name_cache_.Clear(); + DoStackSnapshotParams dssp = DoStackSnapshotParams(prof, buffer); + HRESULT hr = + prof->info12->DoStackSnapshot((ThreadID)NULL, &FrameCallback, COR_PRF_SNAPSHOT_DEFAULT, &dssp, nullptr, 0); + if (FAILED(hr)) + { + trace::Logger::Debug("DoStackSnapshot failed. 
HRESULT=0x", std::setfill('0'), std::setw(8), std::hex, hr); + } +} + +AllocationSubSampler::AllocationSubSampler(uint32_t targetPerCycle_, uint32_t secondsPerCycle_) + : targetPerCycle(targetPerCycle_) + , secondsPerCycle(secondsPerCycle_) + , seenThisCycle(0) + , sampledThisCycle(0) + , seenLastCycle(0) + , nextCycleStartMillis( + std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch())) + , sampleLock() + , rand(std::default_random_engine((unsigned int)(nextCycleStartMillis.count()))) +{ +} +void AllocationSubSampler::AdvanceCycle(std::chrono::milliseconds now) +{ + nextCycleStartMillis = now + std::chrono::seconds(secondsPerCycle); + seenLastCycle = seenThisCycle; + seenThisCycle = 0; + sampledThisCycle = 0; +} + +// We want to sample T items out of N per unit time, where N is unknown and may be < T or may be orders +// of magnitude bigger than T. One excellent approach for this is reservoir sampling, where new items +// displace existing samples such that the end result is a uniform sample of N. However, our overhead is not +// just limited to the subscription to the AllocationTick events, but also the additional +// captured data (e.g., the stack trace, locking and copying the span context). Therefore, reservoir "replacements" +// where an already-captured item gets displaced by a new one add additional undesired overhead. How much? +// Well, some monte carlo experiments with (e.g.) T=100 and N=1000 suggest that the wasted overhead on unsent data +// would be Waste~=230, a tremendous waste of CPU cycles to collect and then discard 230 stack traces, etc. +// Instead, let's treat the current cycle as statistically very similar to the last one, and sample 1/X events +// where X is based on what N was last time. Not the most elegant approach, but simple to code and errs on the +// side of reduced/capped overhead. 
+bool AllocationSubSampler::ShouldSample() +{ + std::lock_guard guard(sampleLock); + + auto now = + std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()); + if (now > nextCycleStartMillis) + { + AdvanceCycle(now); + } + seenThisCycle++; + if (sampledThisCycle >= targetPerCycle) + { + return false; + } + // roll a [1,lastCycle] die, and if it comes up <= targetPerCycle, it wins + // But lastCycle could be 0, so normalize that to 1. + std::uniform_int_distribution rando(1, std::max(seenLastCycle, (uint32_t)1)); + bool sample = rando(rand) <= targetPerCycle; + if (sample) + { + sampledThisCycle++; + } + return sample; +} + +void ContinuousProfiler::AllocationTick(ULONG dataLen, LPCBYTE data) +{ + // try to acquire shared lock without blocking + // and return early if attempt was unsuccessful - + // PauseClrAndCaptureSamples acquired exclusive lock + // and it's not safe to proceed + std::shared_lock shared_lock(profiling_lock, std::try_to_lock); + if (!shared_lock.owns_lock()) + { + // can't continue if suspension already started + trace::Logger::Debug("Possible runtime suspension in progress, can't safely process allocation tick."); + return; + } + if (this->allocationSubSampler == nullptr || !this->allocationSubSampler->ShouldSample()) + { + return; + } + + // In v4 it's the last field, so use a relative offset from the end + uint64_t allocatedSize = *((uint64_t*)&(data[dataLen - 8])); + // Here's the first byte of the typeName + WCHAR* typeName = (WCHAR*)&data[AllocationTickV4TypeNameStartByteIndex]; + + // and its length can be derived without iterating it since there is only the one variable-length field + // account for the null char + size_t typeNameCharLen = (dataLen - AllocationTickV4SizeWithoutTypeName) / 2 - 1; + + ThreadID threadId; + const HRESULT hr = info12->GetCurrentThreadID(&threadId); + if (FAILED(hr)) + { + trace::Logger::Debug("GetCurrentThreadId failed, ", hr); + return; + } + auto unknownThreadState = ThreadState(); + 
auto spanCtx = GetCurrentSpanContext(threadId); + auto threadState = GetCurrentThreadState(threadId); + if (threadState == nullptr) + { + threadState = &unknownThreadState; + } + // Note that by using a local buffer that we will copy as a whole into the + // "main" one later, we gain atomicity and improved concurrency, but lose out on a shared + // string-coding dictionary for all the allocation samples in a cycle. The tradeoffs here + // are non-obvious and the code+locking complexity to share codes would be high, so this will do + // until proven otherwise. The managed code specifically understands that the strings in each + // allocation sample are coded separately so if this changes, that code will need to change too. + std::vector localBytes; + ThreadSamplesBuffer localBuf = ThreadSamplesBuffer(&localBytes); + localBuf.AllocationSample(allocatedSize, typeName, typeNameCharLen, threadId, threadState, spanCtx); + CaptureAllocationStack(this, &localBuf); + localBuf.EndSample(); + AllocationSamplingAppendToBuffer(static_cast(localBytes.size()), localBytes.data()); +} + +void ContinuousProfiler::StartAllocationSampling(const unsigned int maxMemorySamplesPerMinute) +{ + this->allocationSubSampler = new AllocationSubSampler(maxMemorySamplesPerMinute, 60); + + EVENTPIPE_SESSION session; + COR_PRF_EVENTPIPE_PROVIDER_CONFIG sessionConfig[] = {{WStr("Microsoft-Windows-DotNETRuntime"), + 0x1, // CLR_GC_KEYWORD + // documentation says AllocationTick is at info but it lies + COR_PRF_EVENTPIPE_VERBOSE, nullptr}}; + HRESULT hr = this->info12->EventPipeStartSession(1, sessionConfig, false, &session); + if (FAILED(hr)) + { + trace::Logger::Error("Could not enable allocation sampling: session pipe error", hr); + } + + trace::Logger::Info("ContinuousProfiler::MemoryProfiling started."); +} + +void ContinuousProfiler::ThreadCreated(ThreadID thread_id) +{ + // So it seems the Thread* items can be/are called out of order. 
ThreadCreated doesn't carry any valuable + // ThreadState information so this is a deliberate nop. The other methods will fault in ThreadStates + // as needed. + // Hopefully the destroyed event is not called out of order with the others... if so, the worst that happens + // is we get an empty name string and a 0 in the native ID column +} +void ContinuousProfiler::ThreadDestroyed(ThreadID thread_id) +{ + { + std::lock_guard guard(thread_state_lock_); + + const ThreadState* state = managed_tid_to_state_[thread_id]; + + delete state; + + managed_tid_to_state_.erase(thread_id); + } + { + std::lock_guard guard(thread_span_context_lock); + + thread_span_context_map.erase(thread_id); + } +} +void ContinuousProfiler::ThreadNameChanged(ThreadID thread_id, ULONG cch_name, WCHAR name[]) +{ + std::lock_guard guard(thread_state_lock_); + + ThreadState* state = managed_tid_to_state_[thread_id]; + if (state == nullptr) + { + state = new ThreadState(); + managed_tid_to_state_[thread_id] = state; + } + state->thread_name_.clear(); + state->thread_name_.append(name, cch_name); +} + +template +NameCache::NameCache(const size_t maximum_size, const TValue default_value) + : max_size_(maximum_size), default_value_(default_value) +{ +} + +template +TValue NameCache::Get(TKey key) +{ + const auto found = map_.find(key); + if (found == map_.end()) + { + return default_value_; + } + // This voodoo moves the single item in the iterator to the front of the list + // (as it is now the most-recently-used) + list_.splice(list_.begin(), list_, found->second); + return found->second->second; +} + +template +void NameCache::Refresh(TKey key) +{ + const auto found = map_.find(key); + if (found == map_.end()) + { + return; + } + // This voodoo moves the single item in the iterator to the front of the list + // (as it is now the most-recently-used) + list_.splice(list_.begin(), list_, found->second); +} + +template +TValue NameCache::Put(TKey key, TValue val) +{ + const auto pair = std::pair(key, 
val); + list_.push_front(pair); + map_[key] = list_.begin(); + + if (map_.size() > max_size_) + { + const auto& lru = list_.back(); + const auto old_value = lru.second; + map_.erase(lru.first); + list_.pop_back(); + return old_value; + } + return default_value_; +} + +template +void NameCache::Clear() +{ + map_.clear(); + list_.clear(); +} + +} // namespace continuous_profiler + +extern "C" { +EXPORTTHIS int32_t ContinuousProfilerReadThreadSamples(int32_t len, unsigned char* buf) +{ + return ThreadSamplingConsumeOneThreadSample(len, buf); +} +EXPORTTHIS int32_t ContinuousProfilerReadAllocationSamples(int32_t len, unsigned char* buf) +{ + return AllocationSamplingConsumeAndReplaceBuffer(len, buf); +} +EXPORTTHIS void ContinuousProfilerSetNativeContext(uint64_t traceIdHigh, + uint64_t traceIdLow, + uint64_t spanId, + int32_t managedThreadId) +{ + ThreadID threadId; + const HRESULT hr = profiler_info->GetCurrentThreadID(&threadId); + if (FAILED(hr)) + { + trace::Logger::Debug("GetCurrentThreadID failed. 
HRESULT=0x", std::setfill('0'), std::setw(8), std::hex, hr); + return; + } + + std::lock_guard guard(thread_span_context_lock); + + thread_span_context_map[threadId] = + continuous_profiler::thread_span_context(traceIdHigh, traceIdLow, spanId, managedThreadId); +} +} diff --git a/src/OpenTelemetry.AutoInstrumentation.Native/continuous_profiler.h b/src/OpenTelemetry.AutoInstrumentation.Native/continuous_profiler.h new file mode 100644 index 0000000000..68c8657112 --- /dev/null +++ b/src/OpenTelemetry.AutoInstrumentation.Native/continuous_profiler.h @@ -0,0 +1,244 @@ +/* + * Copyright The OpenTelemetry Authors + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef OTEL_CONTINUOUS_PROFILER_H_ +#define OTEL_CONTINUOUS_PROFILER_H_ + +#include "continuous_profiler_clr_helpers.h" + +#include +#include +#include +#include +#include +#include +#include + +constexpr auto unknown_managed_thread_id = -1; + +#ifdef _WIN32 +#define EXPORTTHIS __declspec(dllexport) +#else +#define EXPORTTHIS __attribute__((visibility("default"))) +#endif + +extern "C" +{ + EXPORTTHIS int32_t ContinuousProfilerReadThreadSamples(int32_t len, unsigned char* buf); + EXPORTTHIS int32_t ContinuousProfilerReadAllocationSamples(int32_t len, unsigned char* buf); + // ReSharper disable CppInconsistentNaming + EXPORTTHIS void ContinuousProfilerSetNativeContext(uint64_t traceIdHigh, uint64_t traceIdLow, uint64_t spanId, int32_t managedThreadId); + // ReSharper restore CppInconsistentNaming +} + +namespace continuous_profiler +{ +struct SamplingStatistics +{ + int micros_suspended; + int num_threads; + int total_frames; + int name_cache_misses; + SamplingStatistics() : micros_suspended(0), num_threads(0), total_frames(0), name_cache_misses(0) + { + } + SamplingStatistics(SamplingStatistics const& other) : + micros_suspended(other.micros_suspended), + num_threads(other.num_threads), + total_frames(other.total_frames), + name_cache_misses(other.name_cache_misses) + { + } +}; + +class thread_span_context +{ 
+public: + uint64_t trace_id_high_; + uint64_t trace_id_low_; + uint64_t span_id_; + int32_t managed_thread_id_; + + thread_span_context() : trace_id_high_(0), trace_id_low_(0), span_id_(0), managed_thread_id_(unknown_managed_thread_id) + { + } + thread_span_context(uint64_t _traceIdHigh, uint64_t _traceIdLow, uint64_t _spanId, int32_t managedThreadId) : + trace_id_high_(_traceIdHigh), trace_id_low_(_traceIdLow), span_id_(_spanId), managed_thread_id_(managedThreadId) + { + } + thread_span_context(thread_span_context const& other) : + trace_id_high_(other.trace_id_high_), trace_id_low_(other.trace_id_low_), span_id_(other.span_id_), managed_thread_id_(other.managed_thread_id_) + { + } +}; + +class ThreadState +{ +public: + trace::WSTRING thread_name_; + ThreadState() + { + } + ThreadState(ThreadState const& other) : thread_name_(other.thread_name_) + { + } +}; + +class ThreadSamplesBuffer +{ +public: + std::unordered_map codes_; + std::vector* buffer_; + + explicit ThreadSamplesBuffer(std::vector* buf); + ~ThreadSamplesBuffer(); + void StartBatch() const; + void StartSample(ThreadID id, const ThreadState* state, const thread_span_context& span_context) const; + void RecordFrame(FunctionID fid, const trace::WSTRING& frame); + void EndSample() const; + void EndBatch() const; + void WriteFinalStats(const SamplingStatistics& stats) const; + void AllocationSample(uint64_t allocSize, const WCHAR* allocType, size_t allocTypeCharLen, ThreadID id, const ThreadState* state, const thread_span_context& span_context) const; + +private: + void WriteCurrentTimeMillis() const; + void WriteCodedFrameString(FunctionID fid, const trace::WSTRING& str); + void WriteShort(int16_t val) const; + void WriteInt(int32_t val) const; + void WriteString(const WCHAR* s, size_t len) const; + void WriteString(const trace::WSTRING& str) const; + void WriteByte(unsigned char b) const; + void WriteUInt64(uint64_t val) const; +}; + +struct FunctionIdentifier +{ + mdToken function_token; + ModuleID 
module_id; + bool is_valid; + + bool operator==(const FunctionIdentifier& p) const + { + return function_token == p.function_token && module_id == p.module_id && is_valid == p.is_valid; + } +}; +} // namespace continuous_profiler + +template <> +struct std::hash +{ + std::size_t operator()(const continuous_profiler::FunctionIdentifier& k) const noexcept + { + using std::hash; + using std::size_t; + using std::string; + + const std::size_t h1 = std::hash()(k.function_token); + const std::size_t h2 = std::hash()(k.module_id); + + return h1 ^ h2; + } +}; + +namespace continuous_profiler +{ +template +class NameCache +{ +// ModuleID is volatile but it is unlikely to have exactly same pair of Function Token and ModuleId after changes. +// If fails we should end up we Unknown(unknown) as a result +public: + explicit NameCache(size_t maximum_size, TValue default_value); + TValue Get(TKey key); + void Refresh(TKey key); + // if max cache size is exceeded it return value which should be disposed + TValue Put(TKey key, TValue val); + void Clear(); + +private: + TValue default_value_; + size_t max_size_; + std::list> list_; + std::unordered_map>::iterator> map_; +}; + +class NamingHelper +{ +public: + // These are permanent parts of the helper object + ICorProfilerInfo12* info12_ = nullptr; + NameCache function_name_cache_; + NameCache> volatile_function_name_cache_; + + NamingHelper(); + trace::WSTRING* Lookup(FunctionID fid, COR_PRF_FRAME_INFO frame, SamplingStatistics & stats); + +private: + [[nodiscard]] FunctionIdentifier GetFunctionIdentifier(const FunctionID func_id, + const COR_PRF_FRAME_INFO frame_info) const; + void GetFunctionName(FunctionIdentifier function_identifier, trace::WSTRING& result); + +}; + +// We can get more AllocationTick events than we reasonably want to push to the cloud; this +// structure/logic helps us rate-control this effect. More details about the algorithm are in the +// implementation of ShouldSample(). 
+class AllocationSubSampler +{ +public: + AllocationSubSampler(uint32_t targetPerCycle, uint32_t secondsPerCycle); + bool ShouldSample(); + // internal implementation detail that is public for unit testing purposes + void AdvanceCycle(std::chrono::milliseconds now); + +private: + uint32_t targetPerCycle; + uint32_t secondsPerCycle; + uint32_t seenThisCycle; + uint32_t sampledThisCycle; + uint32_t seenLastCycle; + std::chrono::milliseconds nextCycleStartMillis; + std::mutex sampleLock; + std::default_random_engine rand; +}; + +class ContinuousProfiler +{ +public: + unsigned int threadSamplingInterval; + void StartThreadSampling(unsigned int threadSamplingInterval); + unsigned int maxMemorySamplesPerMinute; + void StartAllocationSampling(unsigned int maxMemorySamplesPerMinute); + void AllocationTick(ULONG dataLen, LPCBYTE data); + ICorProfilerInfo12* info12; + static void ThreadCreated(ThreadID thread_id); + void ThreadDestroyed(ThreadID thread_id); + void ThreadNameChanged(ThreadID thread_id, ULONG cch_name, WCHAR name[]); + + void SetGlobalInfo12(ICorProfilerInfo12* info12); + ThreadState* GetCurrentThreadState(ThreadID tid); + + std::unordered_map managed_tid_to_state_; + std::mutex thread_state_lock_; + NamingHelper helper; + AllocationSubSampler* allocationSubSampler = nullptr; + + // These cycle every sample and/or are owned externally + ThreadSamplesBuffer* cur_cpu_writer_ = nullptr; + SamplingStatistics stats_; + bool AllocateBuffer(); + void PublishBuffer(); +}; + +} // namespace continuous_profiler + +void AllocationSamplingAppendToBuffer(int32_t appendLen, unsigned char* appendBuf); + +bool ThreadSamplingShouldProduceThreadSample(); +void ThreadSamplingRecordProducedThreadSample(std::vector* buf); +// Can return 0 if none are pending +int32_t ThreadSamplingConsumeOneThreadSample(int32_t len, unsigned char* buf); + +#endif // OTEL_CONTINUOUS_PROFILER_H_ diff --git a/src/OpenTelemetry.AutoInstrumentation.Native/continuous_profiler_clr_helpers.cpp 
b/src/OpenTelemetry.AutoInstrumentation.Native/continuous_profiler_clr_helpers.cpp new file mode 100644 index 0000000000..857daa99cc --- /dev/null +++ b/src/OpenTelemetry.AutoInstrumentation.Native/continuous_profiler_clr_helpers.cpp @@ -0,0 +1,368 @@ +/* + * Copyright The OpenTelemetry Authors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "continuous_profiler_clr_helpers.h" + +#include + +#include "otel_profiler_constants.h" +#include "environment_variables.h" +#include "logger.h" +#include "macros.h" +#include +#include + +#include "pal.h" + +// this file contains optimized, specified version of code from clr_helpers.cpp + +namespace continuous_profiler +{ +FunctionInfo GetFunctionInfo(const ComPtr& metadata_import, const mdToken& token) +{ + mdToken parent_token = mdTokenNil; + WCHAR function_name[trace::kNameMaxSize]{}; + DWORD function_name_len = 0; + + PCCOR_SIGNATURE raw_signature; + ULONG raw_signature_len; + + HRESULT hr = E_FAIL; + switch (const auto token_type = TypeFromToken(token)) + { + case mdtMemberRef: + hr = metadata_import->GetMemberRefProps(token, &parent_token, function_name, trace::kNameMaxSize, + &function_name_len, &raw_signature, &raw_signature_len); + break; + case mdtMethodDef: + hr = metadata_import->GetMemberProps(token, &parent_token, function_name, trace::kNameMaxSize, + &function_name_len, nullptr, &raw_signature, &raw_signature_len, + nullptr, nullptr, nullptr, nullptr, nullptr); + break; + case mdtMethodSpec: + { + hr = metadata_import->GetMethodSpecProps(token, &parent_token, &raw_signature, &raw_signature_len); + if (FAILED(hr)) + { + return {}; + } + const auto generic_info = GetFunctionInfo(metadata_import, parent_token); + std::memcpy(function_name, generic_info.name.c_str(), sizeof(WCHAR) * (generic_info.name.length() + 1)); + function_name_len = DWORD(generic_info.name.length() + 1); + } + break; + default: + trace::Logger::Warn("[trace::GetFunctionInfo] unknown token type: {}", token_type); + return {}; + } + + 
if (FAILED(hr) || function_name_len == 0) + { + return {}; + } + + // parent_token could be: TypeDef, TypeRef, TypeSpec, ModuleRef, MethodDef + const auto type_info = GetTypeInfo(metadata_import, parent_token); + + return {token, trace::WSTRING(function_name), type_info, FunctionMethodSignature(raw_signature, raw_signature_len)}; +} + +TypeInfo GetTypeInfo(const ComPtr& metadata_import, const mdToken& token) +{ + std::shared_ptr parentTypeInfo = nullptr; + mdToken parent_type_token = mdTokenNil; + WCHAR type_name[trace::kNameMaxSize]{}; + DWORD type_name_len = 0; + DWORD type_flags; + + HRESULT hr = E_FAIL; + + switch (const auto token_type = TypeFromToken(token)) + { + case mdtTypeDef: + hr = metadata_import->GetTypeDefProps(token, type_name, trace::kNameMaxSize, &type_name_len, &type_flags, + nullptr); + + metadata_import->GetNestedClassProps(token, &parent_type_token); + if (parent_type_token != mdTokenNil) + { + parentTypeInfo = std::make_shared(GetTypeInfo(metadata_import, parent_type_token)); + } + break; + case mdtTypeRef: + hr = metadata_import->GetTypeRefProps(token, nullptr, type_name, trace::kNameMaxSize, &type_name_len); + break; + case mdtTypeSpec: + { + PCCOR_SIGNATURE signature{}; + ULONG signature_length{}; + + hr = metadata_import->GetTypeSpecFromToken(token, &signature, &signature_length); + + if (FAILED(hr) || signature_length < 3) + { + return {}; + } + + if (signature[0] & ELEMENT_TYPE_GENERICINST) + { + mdToken type_token; + CorSigUncompressToken(&signature[2], &type_token); + const auto baseType = GetTypeInfo(metadata_import, type_token); + return {baseType.id, baseType.name}; + } + } + break; + case mdtModuleRef: + metadata_import->GetModuleRefProps(token, type_name, trace::kNameMaxSize, &type_name_len); + break; + case mdtMemberRef: + case mdtMethodDef: + return GetFunctionInfo(metadata_import, token).type; + } + + if (FAILED(hr) || type_name_len == 0) + { + return {}; + } + + trace::WSTRING type_name_string; + + if (parentTypeInfo != 
nullptr) + { + type_name_string = parentTypeInfo->name + name_separator + trace::WSTRING(type_name); + } + else + { + type_name_string = trace::WSTRING(type_name); + } + return {token, type_name_string}; +} + +trace::WSTRING ExtractParameterName(PCCOR_SIGNATURE& pb_cur, + const ComPtr& metadata_import, + const mdGenericParam* generic_parameters) +{ + pb_cur++; + ULONG num = 0; + pb_cur += CorSigUncompressData(pb_cur, &num); + if (num >= kGenericParamsMaxLen) + { + return kUnknown; + } + WCHAR param_type_name[kParamNameMaxLen]{}; + ULONG pch_name = 0; + const auto hr = metadata_import->GetGenericParamProps(generic_parameters[num], nullptr, nullptr, nullptr, nullptr, + param_type_name, kParamNameMaxLen, &pch_name); + if (FAILED(hr)) + { + trace::Logger::Debug("GetGenericParamProps failed. HRESULT=0x", std::setfill('0'), std::setw(8), std::hex, hr); + return kUnknown; + } + return param_type_name; +} + +trace::WSTRING GetSigTypeTokNameNew(PCCOR_SIGNATURE& pb_cur, + const ComPtr& metadata_import, + mdGenericParam class_params[], + mdGenericParam method_params[]) +{ + trace::WSTRING token_name = trace::EmptyWStr; + bool ref_flag = false; + if (*pb_cur == ELEMENT_TYPE_BYREF) + { + pb_cur++; + ref_flag = true; + } + + bool pointer_flag = false; + if (*pb_cur == ELEMENT_TYPE_PTR) + { + pb_cur++; + pointer_flag = true; + } + + switch (*pb_cur) + { + case ELEMENT_TYPE_BOOLEAN: + token_name = trace::SystemBoolean; + pb_cur++; + break; + case ELEMENT_TYPE_CHAR: + token_name = trace::SystemChar; + pb_cur++; + break; + case ELEMENT_TYPE_I1: + token_name = trace::SystemSByte; + pb_cur++; + break; + case ELEMENT_TYPE_U1: + token_name = trace::SystemByte; + pb_cur++; + break; + case ELEMENT_TYPE_U2: + token_name = trace::SystemUInt16; + pb_cur++; + break; + case ELEMENT_TYPE_I2: + token_name = trace::SystemInt16; + pb_cur++; + break; + case ELEMENT_TYPE_I4: + token_name = trace::SystemInt32; + pb_cur++; + break; + case ELEMENT_TYPE_U4: + token_name = trace::SystemUInt32; + 
pb_cur++; + break; + case ELEMENT_TYPE_I8: + token_name = trace::SystemInt64; + pb_cur++; + break; + case ELEMENT_TYPE_U8: + token_name = trace::SystemUInt64; + pb_cur++; + break; + case ELEMENT_TYPE_R4: + token_name = trace::SystemSingle; + pb_cur++; + break; + case ELEMENT_TYPE_R8: + token_name = trace::SystemDouble; + pb_cur++; + break; + case ELEMENT_TYPE_I: + token_name = trace::SystemIntPtr; + pb_cur++; + break; + case ELEMENT_TYPE_U: + token_name = trace::SystemUIntPtr; + pb_cur++; + break; + case ELEMENT_TYPE_STRING: + token_name = trace::SystemString; + pb_cur++; + break; + case ELEMENT_TYPE_OBJECT: + token_name = trace::SystemObject; + pb_cur++; + break; + case ELEMENT_TYPE_CLASS: + case ELEMENT_TYPE_VALUETYPE: + { + pb_cur++; + mdToken token; + pb_cur += CorSigUncompressToken(pb_cur, &token); + token_name = GetTypeInfo(metadata_import, token).name; + break; + } + case ELEMENT_TYPE_SZARRAY: + { + pb_cur++; + token_name = GetSigTypeTokNameNew(pb_cur, metadata_import, class_params, method_params) + WStr("[]"); + break; + } + case ELEMENT_TYPE_GENERICINST: + { + pb_cur++; + token_name = GetSigTypeTokNameNew(pb_cur, metadata_import, class_params, method_params); + token_name += kGenericParamsOpeningBrace; + ULONG num = 0; + pb_cur += CorSigUncompressData(pb_cur, &num); + for (ULONG i = 0; i < num; i++) + { + token_name += GetSigTypeTokNameNew(pb_cur, metadata_import, class_params, method_params); + if (i != num - 1) + { + token_name += kParamsSeparator; + } + } + token_name += kGenericParamsClosingBrace; + break; + } + case ELEMENT_TYPE_MVAR: + { + token_name += ExtractParameterName(pb_cur, metadata_import, method_params); + break; + } + case ELEMENT_TYPE_VAR: + { + token_name += ExtractParameterName(pb_cur, metadata_import, class_params); + break; + } + default: + break; + } + + if (ref_flag) + { + token_name += WStr("&"); + } + + if (pointer_flag) + { + token_name += WStr("*"); + } + return token_name; +} + +trace::WSTRING 
TypeSignature::GetTypeTokName(ComPtr& pImport, + mdGenericParam class_params[], + mdGenericParam method_params[]) const +{ + PCCOR_SIGNATURE pbCur = &pbBase[offset]; + return GetSigTypeTokNameNew(pbCur, pImport, class_params, method_params); +} + +HRESULT FunctionMethodSignature::TryParse() +{ + PCCOR_SIGNATURE pbCur = pbBase; + PCCOR_SIGNATURE pbEnd = pbBase + len; + unsigned char elem_type; + + IfFalseRetFAIL(trace::ParseByte(pbCur, pbEnd, &elem_type)); + + if (elem_type & IMAGE_CEE_CS_CALLCONV_GENERIC) + { + unsigned gen_param_count; + IfFalseRetFAIL(trace::ParseNumber(pbCur, pbEnd, &gen_param_count)); + } + + unsigned param_count; + IfFalseRetFAIL(trace::ParseNumber(pbCur, pbEnd, ¶m_count)); + + IfFalseRetFAIL(trace::ParseRetType(pbCur, pbEnd)); + + auto fEncounteredSentinal = false; + for (unsigned i = 0; i < param_count; i++) + { + if (pbCur >= pbEnd) + return E_FAIL; + + if (*pbCur == ELEMENT_TYPE_SENTINEL) + { + if (fEncounteredSentinal) + return E_FAIL; + + fEncounteredSentinal = true; + pbCur++; + } + + const PCCOR_SIGNATURE pbParam = pbCur; + + IfFalseRetFAIL(trace::ParseParamOrLocal(pbCur, pbEnd)); + + TypeSignature argument{}; + argument.pbBase = pbBase; + argument.length = (ULONG)(pbCur - pbParam); + argument.offset = (ULONG)(pbCur - pbBase - argument.length); + + params.push_back(argument); + } + + return S_OK; +} +} // namespace continuous_profiler diff --git a/src/OpenTelemetry.AutoInstrumentation.Native/continuous_profiler_clr_helpers.h b/src/OpenTelemetry.AutoInstrumentation.Native/continuous_profiler_clr_helpers.h new file mode 100644 index 0000000000..b23a663f37 --- /dev/null +++ b/src/OpenTelemetry.AutoInstrumentation.Native/continuous_profiler_clr_helpers.h @@ -0,0 +1,116 @@ +/* + * Copyright The OpenTelemetry Authors + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef OTEL_CONTINUOUS_PROFILER_CLR_HELPERS_H_ +#define OTEL_CONTINUOUS_PROFILER_CLR_HELPERS_H_ + +#include +#include +#include +#include + +#include "integration.h" + +#include 
"./clr_helpers.h" + +#include + +// this file contains optimized, specified version of code from clr_helpers.h + +namespace continuous_profiler +{ +constexpr auto kParamNameMaxLen = 260; +constexpr auto kGenericParamsMaxLen = 20; +constexpr auto kUnknown = WStr("Unknown"); +constexpr auto kParamsSeparator = WStr(", "); +constexpr auto kGenericParamsOpeningBrace = WStr("["); +constexpr auto kGenericParamsClosingBrace = WStr("]"); +constexpr auto kFunctionParamsOpeningBrace = WStr("("); +constexpr auto kFunctionParamsClosingBrace = WStr(")"); +constexpr auto name_separator = WStr("."); + +struct TypeInfo +{ + const mdToken id; + const trace::WSTRING name; + + TypeInfo() : + id(0), + name(trace::EmptyWStr) + { + } + TypeInfo(const mdToken id, const trace::WSTRING name) : + id(id), + name(name) + { + } +}; + +struct TypeSignature +{ + ULONG offset; + ULONG length; + PCCOR_SIGNATURE pbBase; + + trace::WSTRING GetTypeTokName(ComPtr& pImport, mdGenericParam class_params[], + mdGenericParam method_params[]) const; +}; + +struct FunctionMethodSignature +{ +private: + PCCOR_SIGNATURE pbBase; + unsigned len; + std::vector params; + +public: + FunctionMethodSignature() : pbBase(nullptr), len(0) + { + } + FunctionMethodSignature(PCCOR_SIGNATURE pb, unsigned cbBuffer) + { + pbBase = pb; + len = cbBuffer; + }; + const std::vector& GetMethodArguments() const + { + return params; + } + HRESULT TryParse(); +}; + +struct FunctionInfo +{ + const mdToken id; + const trace::WSTRING name; + const TypeInfo type; + FunctionMethodSignature method_signature; + + FunctionInfo() : id(0), name(trace::EmptyWStr), type({}), method_signature({}) + { + } + + FunctionInfo(mdToken id, trace::WSTRING name, TypeInfo type, + FunctionMethodSignature method_signature) : + id(id), + name(name), + type(type), + method_signature(method_signature) + { + } + + bool IsValid() const + { + return id != 0; + } +}; + +FunctionInfo GetFunctionInfo(const ComPtr& metadata_import, const mdToken& token); + +TypeInfo 
GetTypeInfo(const ComPtr& metadata_import, const mdToken& token); + +} // namespace continuous_profiler + +#endif diff --git a/src/OpenTelemetry.AutoInstrumentation.Native/cor_profiler.cpp b/src/OpenTelemetry.AutoInstrumentation.Native/cor_profiler.cpp index 327018752d..bea3fc3d87 100644 --- a/src/OpenTelemetry.AutoInstrumentation.Native/cor_profiler.cpp +++ b/src/OpenTelemetry.AutoInstrumentation.Native/cor_profiler.cpp @@ -29,6 +29,7 @@ #include "stats.h" #include "util.h" #include "version.h" +#include "continuous_profiler.h" #ifdef MACOS #include @@ -62,7 +63,8 @@ CorProfiler* profiler = nullptr; // HRESULT STDMETHODCALLTYPE CorProfiler::Initialize(IUnknown* cor_profiler_info_unknown) { - auto _ = trace::Stats::Instance()->InitializeMeasure(); + auto _ = trace::Stats::Instance()->InitializeMeasure(); + this->continuousProfiler = nullptr; CorProfilerBase::Initialize(cor_profiler_info_unknown); @@ -103,7 +105,8 @@ HRESULT STDMETHODCALLTYPE CorProfiler::Initialize(IUnknown* cor_profiler_info_un if (SUCCEEDED(hr)) { Logger::Debug("Interface ICorProfilerInfo12 found."); - this->info_ = info12; + this->info_ = info12; + this->info12_ = info12; } else { @@ -113,7 +116,8 @@ HRESULT STDMETHODCALLTYPE CorProfiler::Initialize(IUnknown* cor_profiler_info_un { FailProfiler(Warn, "Failed to attach profiler: Not supported .NET Framework version (lower than 4.6.1).") } - info12 = nullptr; + info12 = nullptr; + this->info12_ = nullptr; } // code is ready to get runtime information @@ -987,7 +991,7 @@ HRESULT STDMETHODCALLTYPE CorProfiler::JITInlining(FunctionID callerId, Function ModuleID calleeModuleId; mdToken calleFunctionToken = mdTokenNil; - auto hr = this->info_->GetFunctionInfo(calleeId, NULL, &calleeModuleId, &calleFunctionToken); + auto hr = this->info_->GetFunctionInfo(calleeId, nullptr, &calleeModuleId, &calleFunctionToken); *pfShouldInline = true; @@ -1126,6 +1130,54 @@ void CorProfiler::InternalAddInstrumentation(WCHAR* id, CallTargetDefinition* it } } +void 
CorProfiler::ConfigureContinuousProfiler(bool threadSamplingEnabled, + unsigned int threadSamplingInterval, + bool allocationSamplingEnabled, + unsigned int maxMemorySamplesPerMinute) +{ + Logger::Info("ConfigureContinuousProfiler: thread sampling enabled: ", threadSamplingEnabled, + ", thread sampling interval: ", threadSamplingInterval, ", allocationSamplingEnabled: ", + allocationSamplingEnabled, ", max memory samples per minute: ", maxMemorySamplesPerMinute); + + if (!threadSamplingEnabled && !allocationSamplingEnabled) + { + Logger::Debug("ConfigureContinuousProfiler: Thread sampling and allocations sampling disabled."); + return; + } + + DWORD pdvEventsLow; + DWORD pdvEventsHigh; + auto hr = this->info12_->GetEventMask2(&pdvEventsLow, &pdvEventsHigh); + if (FAILED(hr)) + { + Logger::Warn("ConfigureContinuousProfiler: Failed to take event masks for continuous profiler."); + return; + } + + pdvEventsLow |= COR_PRF_MONITOR_THREADS | COR_PRF_ENABLE_STACK_SNAPSHOT; + + hr = this->info12_->SetEventMask2(pdvEventsLow, pdvEventsHigh); + if (FAILED(hr)) + { + Logger::Warn("ConfigureContinuousProfiler: Failed to set event masks for continuous profiler."); + return; + } + + this->continuousProfiler = new continuous_profiler::ContinuousProfiler(); + this->continuousProfiler->SetGlobalInfo12(this->info12_); + Logger::Info("ConfigureContinuousProfiler: Events masks configured for continuous profiler"); + + if (threadSamplingEnabled) + { + this->continuousProfiler->StartThreadSampling(threadSamplingInterval); + } + + if (allocationSamplingEnabled) + { + this->continuousProfiler->StartAllocationSampling(maxMemorySamplesPerMinute); + } +} + // // ICorProfilerCallback6 methods // @@ -2620,4 +2672,49 @@ HRESULT STDMETHODCALLTYPE CorProfiler::JITCachedFunctionSearchStarted(FunctionID return S_OK; } +HRESULT STDMETHODCALLTYPE CorProfiler::ThreadCreated(ThreadID threadId) +{ + if (continuousProfiler != nullptr) + { + continuousProfiler->ThreadCreated(threadId); + } + return S_OK; 
+} +HRESULT STDMETHODCALLTYPE CorProfiler::ThreadDestroyed(ThreadID threadId) +{ + if (continuousProfiler != nullptr) + { + continuousProfiler->ThreadDestroyed(threadId); + } + return S_OK; +} +HRESULT STDMETHODCALLTYPE CorProfiler::ThreadNameChanged(ThreadID threadId, ULONG cchName, WCHAR name[]) +{ + if (continuousProfiler != nullptr) + { + continuousProfiler->ThreadNameChanged(threadId, cchName, name); + } + return S_OK; +} + +HRESULT STDMETHODCALLTYPE CorProfiler::EventPipeEventDelivered(EVENTPIPE_PROVIDER provider, + DWORD eventId, + DWORD eventVersion, + ULONG cbMetadataBlob, + LPCBYTE metadataBlob, + ULONG cbEventData, + LPCBYTE eventData, + LPCGUID pActivityId, + LPCGUID pRelatedActivityId, + ThreadID eventThread, + ULONG numStackFrames, + UINT_PTR stackFrames[]) +{ + if (continuousProfiler != nullptr && eventId == 10 && eventVersion == 4) + { + continuousProfiler->AllocationTick(cbEventData, eventData); + } + return S_OK; +} + } // namespace trace \ No newline at end of file diff --git a/src/OpenTelemetry.AutoInstrumentation.Native/cor_profiler.h b/src/OpenTelemetry.AutoInstrumentation.Native/cor_profiler.h index 64e56b7d0b..bd49305b6c 100644 --- a/src/OpenTelemetry.AutoInstrumentation.Native/cor_profiler.h +++ b/src/OpenTelemetry.AutoInstrumentation.Native/cor_profiler.h @@ -24,6 +24,12 @@ #include #include "clr_helpers.h" +// Forward declaration +namespace continuous_profiler +{ +class ContinuousProfiler; +} + namespace trace { @@ -48,6 +54,9 @@ class CorProfiler : public CorProfilerBase bool in_azure_app_services = false; bool is_desktop_iis = false; + continuous_profiler::ContinuousProfiler* continuousProfiler; + + // // CallTarget Members // @@ -169,6 +178,25 @@ class CorProfiler : public CorProfilerBase HRESULT STDMETHODCALLTYPE JITCachedFunctionSearchStarted(FunctionID functionId, BOOL* pbUseCachedFunction) override; + // ICorProfilerInfo callbacks to track thread naming (used by ThreadSampler only) + HRESULT STDMETHODCALLTYPE ThreadCreated(ThreadID 
threadId) override; + HRESULT STDMETHODCALLTYPE ThreadDestroyed(ThreadID threadId) override; + HRESULT STDMETHODCALLTYPE ThreadNameChanged(ThreadID threadId, ULONG cchName, WCHAR name[]) override; + + // Needed for allocation sampling + HRESULT STDMETHODCALLTYPE EventPipeEventDelivered(EVENTPIPE_PROVIDER provider, + DWORD eventId, + DWORD eventVersion, + ULONG cbMetadataBlob, + LPCBYTE metadataBlob, + ULONG cbEventData, + LPCBYTE eventData, + LPCGUID pActivityId, + LPCGUID pRelatedActivityId, + ThreadID eventThread, + ULONG numStackFrames, + UINT_PTR stackFrames[]) override; + // // ICorProfilerCallback6 methods // @@ -181,6 +209,11 @@ class CorProfiler : public CorProfilerBase void AddInstrumentations(WCHAR* id, CallTargetDefinition* items, int size); void AddDerivedInstrumentations(WCHAR* id, CallTargetDefinition* items, int size); + // + // Continuous Profiler methods + // + void ConfigureContinuousProfiler(bool threadSamplingEnabled, unsigned int threadSamplingInterval, bool allocationSamplingEnabled, unsigned int maxMemorySamplesPerMinute); + friend class TracerMethodRewriter; }; diff --git a/src/OpenTelemetry.AutoInstrumentation.Native/cor_profiler_base.h b/src/OpenTelemetry.AutoInstrumentation.Native/cor_profiler_base.h index 570801df78..5257ff4775 100644 --- a/src/OpenTelemetry.AutoInstrumentation.Native/cor_profiler_base.h +++ b/src/OpenTelemetry.AutoInstrumentation.Native/cor_profiler_base.h @@ -20,6 +20,7 @@ class CorProfilerBase : public ICorProfilerCallback10 protected: ICorProfilerInfo7* info_; + ICorProfilerInfo12* info12_; public: CorProfilerBase(); diff --git a/src/OpenTelemetry.AutoInstrumentation.Native/interop.cpp b/src/OpenTelemetry.AutoInstrumentation.Native/interop.cpp index ad0c054b6e..f7d71c9b69 100644 --- a/src/OpenTelemetry.AutoInstrumentation.Native/interop.cpp +++ b/src/OpenTelemetry.AutoInstrumentation.Native/interop.cpp @@ -41,6 +41,15 @@ EXTERN_C VOID STDAPICALLTYPE AddDerivedInstrumentations(WCHAR* id, trace::CallTa return 
trace::profiler->AddDerivedInstrumentations(id, items, size); } +EXTERN_C VOID STDAPICALLTYPE ConfigureContinuousProfiler(bool threadSamplingEnabled, + unsigned int threadSamplingInterval, + bool allocationSamplingEnabled, + unsigned int maxMemorySamplesPerMinute) +{ + return trace::profiler->ConfigureContinuousProfiler(threadSamplingEnabled, threadSamplingInterval, + allocationSamplingEnabled, maxMemorySamplesPerMinute); +} + #ifndef _WIN32 EXTERN_C void* dddlopen(const char* __file, int __mode) { diff --git a/src/OpenTelemetry.AutoInstrumentation/ContinuousProfiler/BufferProcessor.cs b/src/OpenTelemetry.AutoInstrumentation/ContinuousProfiler/BufferProcessor.cs new file mode 100644 index 0000000000..6e5b23f306 --- /dev/null +++ b/src/OpenTelemetry.AutoInstrumentation/ContinuousProfiler/BufferProcessor.cs @@ -0,0 +1,66 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +#if NET6_0_OR_GREATER + +using System.Reflection; + +namespace OpenTelemetry.AutoInstrumentation.ContinuousProfiler; + +internal class BufferProcessor +{ + // If you change any of these constants, check with continuous_profiler.cpp first + private const int BufferSize = 200 * 1024; + + private readonly bool _threadSamplingEnabled; + private readonly bool _allocationSamplingEnabled; + private readonly object _continuousProfilerExporter; + private readonly MethodInfo _exportThreadSamplesMethod; + private readonly MethodInfo _exportAllocationSamplesMethod; + private readonly byte[] _buffer = new byte[BufferSize]; + + public BufferProcessor(bool threadSamplingEnabled, bool allocationSamplingEnabled, object continuousProfilerExporter, MethodInfo exportThreadSamplesMethod, MethodInfo exportAllocationSamplesMethod) + { + _threadSamplingEnabled = threadSamplingEnabled; + _allocationSamplingEnabled = allocationSamplingEnabled; + _continuousProfilerExporter = continuousProfilerExporter; + _exportThreadSamplesMethod = exportThreadSamplesMethod; + _exportAllocationSamplesMethod 
= exportAllocationSamplesMethod; + } + + public void Process() + { + if (_threadSamplingEnabled) + { + ProcessThreadSamples(); + } + + if (_allocationSamplingEnabled) + { + ProcessAllocationSamples(); + } + } + + private void ProcessThreadSamples() + { + var read = NativeMethods.ContinuousProfilerReadThreadSamples(_buffer.Length, _buffer); + if (read <= 0) + { + return; + } + + _exportThreadSamplesMethod.Invoke(_continuousProfilerExporter, new object[] { _buffer, read }); + } + + private void ProcessAllocationSamples() + { + var read = NativeMethods.ContinuousProfilerReadAllocationSamples(_buffer.Length, _buffer); + if (read <= 0) + { + return; + } + + _exportAllocationSamplesMethod.Invoke(_continuousProfilerExporter, new object[] { _buffer, read }); + } +} +#endif diff --git a/src/OpenTelemetry.AutoInstrumentation/ContinuousProfiler/ContinuousProfilerProcessor.cs b/src/OpenTelemetry.AutoInstrumentation/ContinuousProfiler/ContinuousProfilerProcessor.cs new file mode 100644 index 0000000000..2f2a4899fb --- /dev/null +++ b/src/OpenTelemetry.AutoInstrumentation/ContinuousProfiler/ContinuousProfilerProcessor.cs @@ -0,0 +1,82 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +#if NET6_0_OR_GREATER + +using System.Globalization; +using OpenTelemetry.AutoInstrumentation.Logging; + +namespace OpenTelemetry.AutoInstrumentation.ContinuousProfiler; + +internal static class ContinuousProfilerProcessor +{ + public const string BackgroundThreadName = "OpenTelemetry Continuous Profiler Thread"; + + private static readonly IOtelLogger Logger = OtelLogging.GetLogger(); + + public static void Activity_CurrentChanged(object? 
sender, System.Diagnostics.ActivityChangedEventArgs e) + { + var currentActivity = e.Current; + var managedThreadId = Environment.CurrentManagedThreadId; + + if (currentActivity != null) + { + var hexTraceId = currentActivity.TraceId.ToHexString(); + + if (ulong.TryParse(hexTraceId.AsSpan(0, 16), NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var traceIdHigh) && + ulong.TryParse(hexTraceId.AsSpan(16), NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var traceIdLow) && + ulong.TryParse(currentActivity.SpanId.ToHexString(), NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var spanId)) + { + NativeMethods.ContinuousProfilerSetNativeContext(traceIdHigh, traceIdLow, spanId, managedThreadId); + return; + } + } + + NativeMethods.ContinuousProfilerSetNativeContext(0, 0, 0, managedThreadId); + } + + public static void Initialize(bool threadSamplingEnabled, bool allocationSamplingEnabled, TimeSpan exportInterval, object continuousProfilerExporter) + { + Logger.Debug("Initializing Continuous Profiler export thread."); + + var continuousProfilerExporterType = continuousProfilerExporter.GetType(); + var exportThreadSamplesMethod = continuousProfilerExporterType.GetMethod("ExportThreadSamples"); + + if (exportThreadSamplesMethod == null) + { + Logger.Warning("Exporter does not have ExportThreadSamples method. Continuous Profiler initialization failed."); + return; + } + + var exportAllocationSamplesMethod = continuousProfilerExporterType.GetMethod("ExportAllocationSamples"); + if (exportAllocationSamplesMethod == null) + { + Logger.Warning("Exporter does not have ExportAllocationSamples method. 
Continuous Profiler initialization failed."); + return; + } + + // TODO Graceful shutdown and Task.Delay https://github.com/open-telemetry/opentelemetry-dotnet-instrumentation/issues/3216 + var thread = new Thread(() => + { + SampleReadingThread(new BufferProcessor(threadSamplingEnabled, allocationSamplingEnabled, continuousProfilerExporter, exportThreadSamplesMethod, exportAllocationSamplesMethod), exportInterval); + }) + { + Name = BackgroundThreadName, + IsBackground = true + }; + thread.Start(); + + Logger.Information("Continuous Profiler export thread initialized."); + } + + private static void SampleReadingThread(BufferProcessor sampleExporter, TimeSpan exportInterval) + { + while (true) + { + // TODO Graceful shutdown and Task.Delay https://github.com/open-telemetry/opentelemetry-dotnet-instrumentation/issues/3216 + Thread.Sleep(exportInterval); + sampleExporter.Process(); + } + } +} +#endif diff --git a/src/OpenTelemetry.AutoInstrumentation/Instrumentation.cs b/src/OpenTelemetry.AutoInstrumentation/Instrumentation.cs index a576ccfceb..29643b58a0 100644 --- a/src/OpenTelemetry.AutoInstrumentation/Instrumentation.cs +++ b/src/OpenTelemetry.AutoInstrumentation/Instrumentation.cs @@ -1,7 +1,13 @@ // Copyright The OpenTelemetry Authors // SPDX-License-Identifier: Apache-2.0 +#if NET6_0_OR_GREATER +using System.Diagnostics; +#endif using OpenTelemetry.AutoInstrumentation.Configurations; +#if NET6_0_OR_GREATER +using OpenTelemetry.AutoInstrumentation.ContinuousProfiler; +#endif using OpenTelemetry.AutoInstrumentation.Diagnostics; using OpenTelemetry.AutoInstrumentation.Loading; using OpenTelemetry.AutoInstrumentation.Logging; @@ -79,6 +85,30 @@ public static void Initialize() _pluginManager = new PluginManager(GeneralSettings.Value); _pluginManager.Initializing(); +#if NET6_0_OR_GREATER + var profilerEnabled = GeneralSettings.Value.ProfilerEnabled; + var threadSamplingEnabled = false; + var allocationSamplingEnabled = false; + TimeSpan exportInterval = default; + 
object? continuousProfilerExporter = null; + + if (profilerEnabled) + { + (threadSamplingEnabled, var threadSamplingInterval, allocationSamplingEnabled, var maxMemorySamplesPerMinute, exportInterval, continuousProfilerExporter) = _pluginManager.GetFirstContinuousConfiguration(); + Logger.Debug($"Continuous profiling configuration: Thread sampling enabled: {threadSamplingEnabled}, thread sampling interval: {threadSamplingInterval}, allocation sampling enabled: {allocationSamplingEnabled}, max memory samples per minute: {maxMemorySamplesPerMinute}, export interval: {exportInterval}, continuous profiler exporter: {continuousProfilerExporter.GetType()}"); + + if (threadSamplingEnabled || allocationSamplingEnabled) + { + NativeMethods.ConfigureNativeContinuousProfiler(threadSamplingEnabled, threadSamplingInterval, allocationSamplingEnabled, maxMemorySamplesPerMinute); + Activity.CurrentChanged += ContinuousProfilerProcessor.Activity_CurrentChanged; + } + } + else + { + Logger.Information("CLR Profiler is not enabled. 
Continuous Profiler will be not started even if configured correctly."); + } +#endif + if (TracerSettings.Value.TracesEnabled || MetricSettings.Value.MetricsEnabled) { // Register to shutdown events @@ -137,6 +167,12 @@ public static void Initialize() Logger.Information("Initialized lazily-loaded metric instrumentations without initializing sdk."); } } +#if NET6_0_OR_GREATER + if (profilerEnabled && (threadSamplingEnabled || allocationSamplingEnabled)) + { + ContinuousProfilerProcessor.Initialize(threadSamplingEnabled, allocationSamplingEnabled, exportInterval, continuousProfilerExporter!); + } +#endif } catch (Exception ex) { diff --git a/src/OpenTelemetry.AutoInstrumentation/NativeMethods.cs b/src/OpenTelemetry.AutoInstrumentation/NativeMethods.cs index 8c4cded0f5..0486d56013 100644 --- a/src/OpenTelemetry.AutoInstrumentation/NativeMethods.cs +++ b/src/OpenTelemetry.AutoInstrumentation/NativeMethods.cs @@ -43,6 +43,42 @@ public static void AddDerivedInstrumentations(string id, NativeCallTargetDefinit } } +#if NET6_0_OR_GREATER + public static void ConfigureNativeContinuousProfiler(bool threadSamplingEnabled, uint threadSamplingInterval, bool allocationSamplingEnabled, uint maxMemorySamplesPerMinute) + { + if (IsWindows) + { + Windows.ConfigureContinuousProfiler(threadSamplingEnabled, threadSamplingInterval, allocationSamplingEnabled, maxMemorySamplesPerMinute); + } + else + { + NonWindows.ConfigureContinuousProfiler(threadSamplingEnabled, threadSamplingInterval, allocationSamplingEnabled, maxMemorySamplesPerMinute); + } + } + + public static int ContinuousProfilerReadThreadSamples(int len, byte[] buf) + { + return IsWindows ? Windows.ContinuousProfilerReadThreadSamples(len, buf) : NonWindows.ContinuousProfilerReadThreadSamples(len, buf); + } + + public static int ContinuousProfilerReadAllocationSamples(int len, byte[] buf) + { + return IsWindows ? 
Windows.ContinuousProfilerReadAllocationSamples(len, buf) : NonWindows.ContinuousProfilerReadAllocationSamples(len, buf); + } + + public static void ContinuousProfilerSetNativeContext(ulong traceIdHigh, ulong traceIdLow, ulong spanId, int managedThreadId) + { + if (IsWindows) + { + Windows.ContinuousProfilerSetNativeContext(traceIdHigh, traceIdLow, spanId, managedThreadId); + } + else + { + NonWindows.ContinuousProfilerSetNativeContext(traceIdHigh, traceIdLow, spanId, managedThreadId); + } + } +#endif + // the "dll" extension is required on .NET Framework // and optional on .NET Core private static class Windows @@ -52,6 +88,21 @@ private static class Windows [DllImport("OpenTelemetry.AutoInstrumentation.Native.dll")] public static extern void AddDerivedInstrumentations([MarshalAs(UnmanagedType.LPWStr)] string id, [In] NativeCallTargetDefinition[] methodArrays, int size); + + [DllImport("OpenTelemetry.AutoInstrumentation.Native.dll")] + public static extern void ConfigureContinuousProfiler(bool threadSamplingEnabled, uint threadSamplingInterval, bool allocationSamplingEnabled, uint maxMemorySamplesPerMinute); + +#if NET6_0_OR_GREATER + + [DllImport("OpenTelemetry.AutoInstrumentation.Native.dll")] + public static extern int ContinuousProfilerReadThreadSamples(int len, byte[] buf); + + [DllImport("OpenTelemetry.AutoInstrumentation.Native.dll")] + public static extern int ContinuousProfilerReadAllocationSamples(int len, byte[] buf); + + [DllImport("OpenTelemetry.AutoInstrumentation.Native.dll")] + public static extern void ContinuousProfilerSetNativeContext(ulong traceIdHigh, ulong traceIdLow, ulong spanId, int managedThreadId); +#endif } // assume .NET Core if not running on Windows @@ -62,5 +113,19 @@ private static class NonWindows [DllImport("OpenTelemetry.AutoInstrumentation.Native")] public static extern void AddDerivedInstrumentations([MarshalAs(UnmanagedType.LPWStr)] string id, [In] NativeCallTargetDefinition[] methodArrays, int size); + + 
[DllImport("OpenTelemetry.AutoInstrumentation.Native")] + public static extern void ConfigureContinuousProfiler(bool threadSamplingEnabled, uint threadSamplingInterval, bool allocationSamplingEnabled, uint maxMemorySamplesPerMinute); + +#if NET6_0_OR_GREATER + [DllImport("OpenTelemetry.AutoInstrumentation.Native")] + public static extern int ContinuousProfilerReadThreadSamples(int len, byte[] buf); + + [DllImport("OpenTelemetry.AutoInstrumentation.Native")] + public static extern int ContinuousProfilerReadAllocationSamples(int len, byte[] buf); + + [DllImport("OpenTelemetry.AutoInstrumentation.Native")] + public static extern void ContinuousProfilerSetNativeContext(ulong traceIdHigh, ulong traceIdLow, ulong spanId, int managedThreadId); +#endif } } diff --git a/src/OpenTelemetry.AutoInstrumentation/Plugins/PluginManager.ContinuousProfiler.cs b/src/OpenTelemetry.AutoInstrumentation/Plugins/PluginManager.ContinuousProfiler.cs new file mode 100644 index 0000000000..e504712cdd --- /dev/null +++ b/src/OpenTelemetry.AutoInstrumentation/Plugins/PluginManager.ContinuousProfiler.cs @@ -0,0 +1,24 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +#if NET6_0_OR_GREATER + +namespace OpenTelemetry.AutoInstrumentation.Plugins; + +internal partial class PluginManager +{ + public (bool ThreadSamplingEnabled, uint ThreadSamplingInterval, bool AllocationSamplingEnabled, uint MaxMemorySamplesPerMinute, TimeSpan ExportInterval, object ContinuousProfilerExporter) GetFirstContinuousConfiguration() + { + foreach (var plugin in _plugins) + { + var mi = plugin.Type.GetMethod("GetContinuousProfilerConfiguration"); + if (mi?.Invoke(plugin.Instance, null) is Tuple continuousProfilerConfiguration) + { + return (continuousProfilerConfiguration.Item1, continuousProfilerConfiguration.Item2, continuousProfilerConfiguration.Item3, continuousProfilerConfiguration.Item4, continuousProfilerConfiguration.Item5, continuousProfilerConfiguration.Item6); + } + } + + 
return (false, 0u, false, 0u, TimeSpan.Zero, new object()); + } +} +#endif diff --git a/src/OpenTelemetry.AutoInstrumentation/Plugins/PluginManager.cs b/src/OpenTelemetry.AutoInstrumentation/Plugins/PluginManager.cs index 2874f4b64f..9327dccf75 100644 --- a/src/OpenTelemetry.AutoInstrumentation/Plugins/PluginManager.cs +++ b/src/OpenTelemetry.AutoInstrumentation/Plugins/PluginManager.cs @@ -9,7 +9,7 @@ namespace OpenTelemetry.AutoInstrumentation.Plugins; -internal class PluginManager +internal partial class PluginManager { private readonly IReadOnlyList<(Type Type, object Instance)> _plugins; diff --git a/test/IntegrationTests/ContinuousProfilerTests.cs b/test/IntegrationTests/ContinuousProfilerTests.cs index dc05ddbc23..611ebadb10 100644 --- a/test/IntegrationTests/ContinuousProfilerTests.cs +++ b/test/IntegrationTests/ContinuousProfilerTests.cs @@ -3,6 +3,7 @@ #if NET6_0_OR_GREATER +using FluentAssertions; using IntegrationTests.Helpers; using Xunit.Abstractions; @@ -17,9 +18,60 @@ public ContinuousProfilerTests(ITestOutputHelper output) [Fact] [Trait("Category", "EndToEnd")] - public void ProfilerTestApplicationExecutesWithoutErrors() + public void ExportAllocationSamples() { - RunTestApplication(); + EnableBytecodeInstrumentation(); + SetEnvironmentVariable("OTEL_DOTNET_AUTO_PLUGINS", "TestApplication.ContinuousProfiler.AllocationPlugin, TestApplication.ContinuousProfiler, Version=1.0.0.0, Culture=neutral, PublicKeyToken=null"); + SetEnvironmentVariable("OTEL_DOTNET_AUTO_TRACES_ADDITIONAL_SOURCES", "TestApplication.ContinuousProfiler"); + var (standardOutput, _, _) = RunTestApplication(); + + standardOutput.Should().Contain(@"ExportAllocationSamples["); + } + + [Fact] + [Trait("Category", "EndToEnd")] + public void ExportThreadSamples() + { + EnableBytecodeInstrumentation(); + SetEnvironmentVariable("OTEL_DOTNET_AUTO_PLUGINS", "TestApplication.ContinuousProfiler.ThreadPlugin, TestApplication.ContinuousProfiler, Version=1.0.0.0, Culture=neutral, 
PublicKeyToken=null"); + SetEnvironmentVariable("OTEL_DOTNET_AUTO_TRACES_ADDITIONAL_SOURCES", "TestApplication.ContinuousProfiler"); + var (standardOutput, _, _) = RunTestApplication(); + + var expectedStackTrace = string.Join(Environment.NewLine, CreateExpectedStackTrace()); + + standardOutput.Should().Contain(expectedStackTrace); + } + + private static List CreateExpectedStackTrace() + { + var stackTrace = new List + { + "System.Threading.Thread.Sleep(System.TimeSpan)", + "TestApplication.ContinuousProfiler.Fs.ClassFs.methodFs(System.String)", + "TestApplication.ContinuousProfiler.Vb.ClassVb.MethodVb(System.String)", + "My.Custom.Test.Namespace.TestDynamicClass.TryInvoke(System.Dynamic.InvokeBinder, System.Object[], System.Object\u0026)", + "System.Dynamic.UpdateDelegates.UpdateAndExecuteVoid3[T0, T1, T2](System.Runtime.CompilerServices.CallSite, T0, T1, T2)", + "My.Custom.Test.Namespace.ClassENonStandardCharacters\u0104\u0118\u00D3\u0141\u017B\u0179\u0106\u0105\u0119\u00F3\u0142\u017C\u017A\u015B\u0107\u011C\u0416\u13F3\u2CC4\u02A4\u01CB\u2093\u06BF\u0B1F\u0D10\u1250\u3023\u203F\u0A6E\u1FAD_\u00601.GenericMethodDFromGenericClass[TMethod, TMethod2](TClass, TMethod, TMethod2)", + "My.Custom.Test.Namespace.ClassD`21.MethodD(T01, T02, T03, T04, T05, T06, T07, T08, T09, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, Unknown)", + "My.Custom.Test.Namespace.GenericClassC`1.GenericMethodCFromGenericClass[T01, T02, T03, T04, T05, T06, T07, T08, T09, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20](T01, T02, T03, T04, T05, T06, T07, T08, T09, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, Unknown)", + "My.Custom.Test.Namespace.GenericClassC`1.GenericMethodCFromGenericClass(T)" + }; + + if (Environment.OSVersion.Platform != PlatformID.Win32NT) + { + stackTrace.Add("Unknown_Native_Function(unknown)"); + } + + stackTrace.Add("My.Custom.Test.Namespace.ClassA.InternalClassB`2.DoubleInternalClassB.TripleInternalClassB`1.MethodB[TB](System.Int32, TC[], TB, 
TD, System.Collections.Generic.IList`1[TA], System.Collections.Generic.IList`1[System.String])"); + stackTrace.Add("My.Custom.Test.Namespace.ClassA.g__Action|7_0[T](System.Int32)"); + stackTrace.Add("My.Custom.Test.Namespace.ClassA.MethodAOthers[T](System.String, System.Object, My.Custom.Test.Namespace.CustomClass, My.Custom.Test.Namespace.CustomStruct, My.Custom.Test.Namespace.CustomClass[], My.Custom.Test.Namespace.CustomStruct[], System.Collections.Generic.List`1[T])"); + stackTrace.Add("My.Custom.Test.Namespace.ClassA.MethodAPointer(System.Int32*)"); + stackTrace.Add("My.Custom.Test.Namespace.ClassA.MethodAFloats(System.Single, System.Double)"); + stackTrace.Add("My.Custom.Test.Namespace.ClassA.MethodAInts(System.UInt16, System.Int16, System.UInt32, System.Int32, System.UInt64, System.Int64, System.IntPtr, System.UIntPtr)"); + stackTrace.Add("My.Custom.Test.Namespace.ClassA.MethodABytes(System.Boolean, System.Char, System.SByte, System.Byte)"); + stackTrace.Add("My.Custom.Test.Namespace.ClassA.MethodA()"); + + return stackTrace; } } #endif diff --git a/test/IntegrationTests/Helpers/InstrumentedProcessHelper.cs b/test/IntegrationTests/Helpers/InstrumentedProcessHelper.cs index 2d20b21ddd..108dd9aafe 100644 --- a/test/IntegrationTests/Helpers/InstrumentedProcessHelper.cs +++ b/test/IntegrationTests/Helpers/InstrumentedProcessHelper.cs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 using System.Diagnostics; +using System.Text; namespace IntegrationTests.Helpers; @@ -23,6 +24,7 @@ public class InstrumentedProcessHelper startInfo.RedirectStandardOutput = true; startInfo.RedirectStandardError = true; startInfo.RedirectStandardInput = false; + startInfo.StandardOutputEncoding = Encoding.Default; return Process.Start(startInfo); } diff --git a/test/test-applications/integrations/TestApplication.ContinuousProfiler/ClassA.cs b/test/test-applications/integrations/TestApplication.ContinuousProfiler/ClassA.cs index 8ff2ea632a..8c15df0719 100644 --- 
a/test/test-applications/integrations/TestApplication.ContinuousProfiler/ClassA.cs +++ b/test/test-applications/integrations/TestApplication.ContinuousProfiler/ClassA.cs @@ -1,6 +1,7 @@ // Copyright The OpenTelemetry Authors // SPDX-License-Identifier: Apache-2.0 +using System.Runtime.CompilerServices; using System.Runtime.InteropServices; namespace My.Custom.Test.Namespace; @@ -11,15 +12,29 @@ internal static class ClassA private delegate int Callback(int n); + [MethodImpl(MethodImplOptions.NoInlining)] public static void MethodA() { + const int numberOfItems = 1024; + var items = new List(); + for (var i = 0; i < numberOfItems; i++) + { + items.Add(i.ToString("D10000")); + } + MethodABytes( false, '\0', sbyte.MaxValue, byte.MaxValue); + + for (var i = 0; i < numberOfItems; i++) + { + TextWriter.Null.Write(items[i][^2]); + } } + [MethodImpl(MethodImplOptions.NoInlining)] public static void MethodABytes( bool b, char c, @@ -37,6 +52,7 @@ public static void MethodABytes( nuint.MaxValue); } + [MethodImpl(MethodImplOptions.NoInlining)] public static void MethodAInts( ushort ui16, short i16, @@ -50,6 +66,7 @@ public static void MethodAInts( MethodAFloats(float.MaxValue, double.MaxValue); } + [MethodImpl(MethodImplOptions.NoInlining)] public static unsafe void MethodAFloats( float fl, double db) @@ -59,6 +76,7 @@ public static unsafe void MethodAFloats( MethodAPointer(pointer); } + [MethodImpl(MethodImplOptions.NoInlining)] public static unsafe void MethodAPointer(int* pointer) { MethodAOthers( @@ -71,6 +89,7 @@ public static unsafe void MethodAPointer(int* pointer) new List()); } + [MethodImpl(MethodImplOptions.NoInlining)] public static void MethodAOthers( string s, object obj, diff --git a/test/test-applications/integrations/TestApplication.ContinuousProfiler/Exporter/AllocationSample.cs b/test/test-applications/integrations/TestApplication.ContinuousProfiler/Exporter/AllocationSample.cs new file mode 100644 index 0000000000..4b06cee29f --- /dev/null +++ 
b/test/test-applications/integrations/TestApplication.ContinuousProfiler/Exporter/AllocationSample.cs @@ -0,0 +1,20 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +namespace TestApplication.ContinuousProfiler; + +internal class AllocationSample +{ + public AllocationSample(long allocationSizeBytes, string allocationTypeName, ThreadSample threadSample) + { + AllocationSizeBytes = allocationSizeBytes; + TypeName = allocationTypeName; + ThreadSample = threadSample; + } + + public long AllocationSizeBytes { get; } + + public string TypeName { get; } + + public ThreadSample ThreadSample { get; } +} diff --git a/test/test-applications/integrations/TestApplication.ContinuousProfiler/Exporter/ConsoleExporter.cs b/test/test-applications/integrations/TestApplication.ContinuousProfiler/Exporter/ConsoleExporter.cs new file mode 100644 index 0000000000..048a9352e4 --- /dev/null +++ b/test/test-applications/integrations/TestApplication.ContinuousProfiler/Exporter/ConsoleExporter.cs @@ -0,0 +1,33 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +using System.Text; + +namespace TestApplication.ContinuousProfiler; + +public class ConsoleExporter +{ + static ConsoleExporter() + { + Console.OutputEncoding = Encoding.Default; + } + + public void ExportThreadSamples(byte[] buffer, int read) + { + var threadSamples = SampleNativeFormatParser.ParseThreadSamples(buffer, read); + + Console.WriteLine("ExportThreadSamples: " + string.Join("\n", threadSamples!.First().Frames)); + } + + public void ExportAllocationSamples(byte[] buffer, int read) + { + var allocationSamples = SampleNativeFormatParser.ParseAllocationSamples(buffer, read); + var allocationSample = allocationSamples.FirstOrDefault(); + if (allocationSample == null) + { + return; + } + + Console.WriteLine($"ExportAllocationSamples[{allocationSample.AllocationSizeBytes}]{string.Join("\n", allocationSample.ThreadSample.Frames)}"); + } +} diff --git 
a/test/test-applications/integrations/TestApplication.ContinuousProfiler/Exporter/SampleNativeFormatParser.cs b/test/test-applications/integrations/TestApplication.ContinuousProfiler/Exporter/SampleNativeFormatParser.cs new file mode 100644 index 0000000000..e38e2702d1 --- /dev/null +++ b/test/test-applications/integrations/TestApplication.ContinuousProfiler/Exporter/SampleNativeFormatParser.cs @@ -0,0 +1,314 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +using System.Text; + +namespace TestApplication.ContinuousProfiler; + +/// +/// Parser the native code's pause-time-optimized format. +/// +internal static class SampleNativeFormatParser +{ + // TODO use value from ContinuousProfilerProcessor.BackgroundThreadName when it will be moved to main project + public const string BackgroundThreadName = "OpenTelemetry Continuous Profiler Thread"; + + private static readonly UnicodeEncoding UnicodeEncoding = new(); + + /// + /// Parses the thread sample batch. + /// + /// byte array containing native thread samples format data + /// how much of the buffer is actually used + internal static List? 
ParseThreadSamples(byte[] buffer, int read) + { + uint batchThreadIndex = 0; + var samples = new List(); + long sampleStartMillis = 0; + + var position = 0; + + // common for samples in a batch + var codeDictionary = new Dictionary(); + + try + { + while (position < read) + { + var operationCode = buffer[position]; + position++; + if (operationCode == OpCodes.StartBatch) + { + var version = ReadInt(buffer, ref position); + if (version != 1) + { + return null; // not able to parse + } + + sampleStartMillis = ReadInt64(buffer, ref position); + + // TODO Debug logs to measure overhead https://github.com/open-telemetry/opentelemetry-dotnet-instrumentation/issues/3205 + /* if (IsLogLevelDebugEnabled) + { + var sampleStart = new DateTime( + (sampleStartMillis * TimeSpan.TicksPerMillisecond) + TimeConstants.UnixEpochInTicks).ToLocalTime(); + Log.Debug( + "Parsing thread samples captured at {date} {time}", + sampleStart.ToLongDateString(), + sampleStart.ToLongTimeString()); + }*/ + } + else if (operationCode == OpCodes.StartSample) + { + var managedId = ReadInt(buffer, ref position); + var threadName = ReadString(buffer, ref position); + var traceIdHigh = ReadInt64(buffer, ref position); + var traceIdLow = ReadInt64(buffer, ref position); + var spanId = ReadInt64(buffer, ref position); + + var threadIndex = batchThreadIndex++; + + var code = ReadShort(buffer, ref position); + if (code == 0) + { + // Empty stack, skip this sample. + continue; + } + + var threadSample = new ThreadSample + { + Timestamp = new ThreadSample.Time(sampleStartMillis), + TraceIdHigh = traceIdHigh, + TraceIdLow = traceIdLow, + SpanId = spanId, + ManagedId = managedId, + ThreadName = threadName, + ThreadIndex = threadIndex + }; + + ReadStackFrames(code, threadSample, codeDictionary, buffer, ref position); + + if (threadName == BackgroundThreadName) + { + // TODO add configuration option to include the sampler thread. By default remove it. 
+ continue; + } + + samples.Add(threadSample); + } + else if (operationCode == OpCodes.EndBatch) + { + // end batch, nothing here + } + else if (operationCode == OpCodes.BatchStats) + { + var microsSuspended = ReadInt(buffer, ref position); + var numThreads = ReadInt(buffer, ref position); + var totalFrames = ReadInt(buffer, ref position); + var numCacheMisses = ReadInt(buffer, ref position); + + // TODO Debug logs to measure overhead https://github.com/open-telemetry/opentelemetry-dotnet-instrumentation/issues/3205 + /*if (IsLogLevelDebugEnabled) + { + Log.Debug( + "CLR was suspended for {microsSuspended} microseconds to collect a thread sample batch: threads={numThreads} frames={totalFrames} misses={numCacheMisses}", + new object[] { microsSuspended, numThreads, totalFrames, numCacheMisses }); + }*/ + } + else + { + position = read + 1; + + // TODO Debug log to handle unexpected buffer https://github.com/open-telemetry/opentelemetry-dotnet-instrumentation/issues/3205 + /* if (IsLogLevelDebugEnabled) + { + Log.Debug("Not expected operation code while parsing thread stack trace: '{0}'. Operation will be ignored.", operationCode); + } */ + } + } + } + catch (Exception e) + { + Console.WriteLine(e + "Unexpected error while parsing thread samples."); + } + + return samples; + } + + /// + /// Parses the allocation sample batch. 
+ /// + /// byte array containing native allocation samples format data + /// how much of the buffer is actually used + internal static List ParseAllocationSamples(byte[] buffer, int read) + { + var allocationSamples = new List(); + var position = 0; + + try + { + while (position < read) + { + var operationCode = buffer[position++]; + + if (operationCode == OpCodes.AllocationSample) + { + var timestampMillis = ReadInt64(buffer, ref position); + var allocatedSize = ReadInt64(buffer, ref position); // Technically uint64 but whatever + var typeName = ReadString(buffer, ref position); + var managedId = ReadInt(buffer, ref position); + var threadName = ReadString(buffer, ref position); + var traceIdHigh = ReadInt64(buffer, ref position); + var traceIdLow = ReadInt64(buffer, ref position); + var spanId = ReadInt64(buffer, ref position); + + var threadSample = new ThreadSample + { + Timestamp = new ThreadSample.Time(timestampMillis), + TraceIdHigh = traceIdHigh, + TraceIdLow = traceIdLow, + SpanId = spanId, + ManagedId = managedId, + ThreadName = threadName + }; + + var code = ReadShort(buffer, ref position); + + // each allocation sample has independently coded strings + var codeDictionary = new Dictionary(); + + ReadStackFrames(code, threadSample, codeDictionary, buffer, ref position); + if (threadName == BackgroundThreadName) + { + // TODO: add configuration option to include the sampler thread. By default remove it. + continue; + } + + allocationSamples.Add(new AllocationSample(allocatedSize, typeName, threadSample)); + } + else + { + position = read + 1; + + /* if (IsLogLevelDebugEnabled) + { + Log.Debug("Not expected operation code while parsing allocation sample: '{0}'. 
Operation will be ignored.", operationCode); + } */ + } + } + } + catch (Exception e) + { + Console.WriteLine(e + "Unexpected error while parsing allocation samples."); + } + + return allocationSamples; + } + + private static string ReadString(byte[] buffer, ref int position) + { + var length = ReadShort(buffer, ref position); + var s = UnicodeEncoding.GetString(buffer, position, length * 2); + position += 2 * length; + return s; + } + + private static short ReadShort(byte[] buffer, ref int position) + { + var s1 = (short)(buffer[position] & 0xFF); + s1 <<= 8; + var s2 = (short)(buffer[position + 1] & 0xFF); + position += 2; + return (short)(s1 + s2); + } + + private static int ReadInt(byte[] buffer, ref int position) + { + var i1 = buffer[position] & 0xFF; + i1 <<= 24; + var i2 = buffer[position + 1] & 0xFF; + i2 <<= 16; + var i3 = buffer[position + 2] & 0xFF; + i3 <<= 8; + var i4 = buffer[position + 3] & 0xFF; + position += 4; + return i1 + i2 + i3 + i4; + } + + private static long ReadInt64(byte[] buffer, ref int position) + { + long l1 = buffer[position] & 0xFF; + l1 <<= 56; + long l2 = buffer[position + 1] & 0xFF; + l2 <<= 48; + long l3 = buffer[position + 2] & 0xFF; + l3 <<= 40; + long l4 = buffer[position + 3] & 0xFF; + l4 <<= 32; + long l5 = buffer[position + 4] & 0xFF; + l5 <<= 24; + long l6 = buffer[position + 5] & 0xFF; + l6 <<= 16; + long l7 = buffer[position + 6] & 0xFF; + l7 <<= 8; + long l8 = buffer[position + 7] & 0xFF; + position += 8; + return l1 + l2 + l3 + l4 + l5 + l6 + l7 + l8; + } + + /// + /// Reads stack frames until 0 (no more frames) is encountered + /// + private static void ReadStackFrames(short code, ThreadSample threadSample, Dictionary dictionary, byte[] buffer, ref int position) + { + while (code != 0) + { + string? 
value; + if (code < 0) + { + value = ReadString(buffer, ref position); + + dictionary[-code] = value; + } + else + { + value = dictionary[code]; + } + + if (value != null) + { + threadSample.Frames.Add(value); + } + + code = ReadShort(buffer, ref position); + } + } + + private static class OpCodes + { + /// + /// Marks the start of a batch of thread samples, see THREAD_SAMPLES_START_BATCH on native code. + /// + public const byte StartBatch = 0x01; + + /// + /// Marks the start of a thread sample, see THREAD_SAMPLES_START_SAMPLE on native code. + /// + public const byte StartSample = 0x02; + + /// + /// Marks the end of a batch of thread samples, see THREAD_SAMPLES_END_BATCH on native code. + /// + public const byte EndBatch = 0x06; + + /// + /// Marks the beginning of a section with statistics, see THREAD_SAMPLES_FINAL_STATS on native code. + /// + public const byte BatchStats = 0x07; + + /// + /// Marks the start of an allocation sample, see THREAD_SAMPLES_ALLOCATION_SAMPLE on native code. + /// + public const byte AllocationSample = 0x08; + } +} diff --git a/test/test-applications/integrations/TestApplication.ContinuousProfiler/Exporter/ThreadSample.cs b/test/test-applications/integrations/TestApplication.ContinuousProfiler/Exporter/ThreadSample.cs new file mode 100644 index 0000000000..f65c2465f7 --- /dev/null +++ b/test/test-applications/integrations/TestApplication.ContinuousProfiler/Exporter/ThreadSample.cs @@ -0,0 +1,36 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +namespace TestApplication.ContinuousProfiler; + +internal class ThreadSample +{ + public Time? Timestamp { get; set; } + + public long SpanId { get; set; } + + public long TraceIdHigh { get; set; } + + public long TraceIdLow { get; set; } + + public int ManagedId { get; set; } + + public string? 
ThreadName { get; set; } + + public uint ThreadIndex { get; set; } + + public IList Frames { get; } = new List(); + + internal class Time + { + public Time(long milliseconds) + { + Milliseconds = milliseconds; + Nanoseconds = (ulong)milliseconds * 1_000_000u; + } + + public ulong Nanoseconds { get; } + + public long Milliseconds { get; } + } +} diff --git a/test/test-applications/integrations/TestApplication.ContinuousProfiler/Plugins/AllocationPlugin.cs b/test/test-applications/integrations/TestApplication.ContinuousProfiler/Plugins/AllocationPlugin.cs new file mode 100644 index 0000000000..4001942f4e --- /dev/null +++ b/test/test-applications/integrations/TestApplication.ContinuousProfiler/Plugins/AllocationPlugin.cs @@ -0,0 +1,19 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +namespace TestApplication.ContinuousProfiler; + +public class AllocationPlugin +{ + public Tuple GetContinuousProfilerConfiguration() + { + var threadSamplingEnabled = false; + var threadSamplingInterval = 1000u; + var allocationSamplingEnabled = true; + var maxMemorySamplesPerMinute = 200u; + var exportInterval = TimeSpan.FromMilliseconds(500); + object continuousProfilerExporter = new ConsoleExporter(); + + return Tuple.Create(threadSamplingEnabled, threadSamplingInterval, allocationSamplingEnabled, maxMemorySamplesPerMinute, exportInterval, continuousProfilerExporter); + } +} diff --git a/test/test-applications/integrations/TestApplication.ContinuousProfiler/Plugins/ThreadPlugin.cs b/test/test-applications/integrations/TestApplication.ContinuousProfiler/Plugins/ThreadPlugin.cs new file mode 100644 index 0000000000..59e9b11ce9 --- /dev/null +++ b/test/test-applications/integrations/TestApplication.ContinuousProfiler/Plugins/ThreadPlugin.cs @@ -0,0 +1,19 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +namespace TestApplication.ContinuousProfiler; + +public class ThreadPlugin +{ + public Tuple 
GetContinuousProfilerConfiguration() + { + var threadSamplingEnabled = true; + var threadSamplingInterval = 1000u; + var allocationSamplingEnabled = false; + var maxMemorySamplesPerMinute = 200u; + var exportInterval = TimeSpan.FromMilliseconds(500); + object continuousProfilerExporter = new ConsoleExporter(); + + return Tuple.Create(threadSamplingEnabled, threadSamplingInterval, allocationSamplingEnabled, maxMemorySamplesPerMinute, exportInterval, continuousProfilerExporter); + } +} diff --git a/test/test-applications/integrations/TestApplication.ContinuousProfiler/Program.cs b/test/test-applications/integrations/TestApplication.ContinuousProfiler/Program.cs index a6d3aa634d..4f8e14f0b8 100644 --- a/test/test-applications/integrations/TestApplication.ContinuousProfiler/Program.cs +++ b/test/test-applications/integrations/TestApplication.ContinuousProfiler/Program.cs @@ -1,6 +1,10 @@ // Copyright The OpenTelemetry Authors // SPDX-License-Identifier: Apache-2.0 +using System.Diagnostics; using My.Custom.Test.Namespace; +ActivitySource activitySource = new("TestApplication.ContinuousProfiler", "1.0.0"); + +using var activity = activitySource.StartActivity(); ClassA.MethodA(); diff --git a/test/test-applications/integrations/TestApplication.ContinuousProfiler/Properties/launchSettings.json b/test/test-applications/integrations/TestApplication.ContinuousProfiler/Properties/launchSettings.json new file mode 100644 index 0000000000..98e335ecdc --- /dev/null +++ b/test/test-applications/integrations/TestApplication.ContinuousProfiler/Properties/launchSettings.json @@ -0,0 +1,20 @@ +{ + "profiles": { + "instrumented": { + "commandName": "Project", + "dotnetRunMessages": true, + "environmentVariables": { + "DOTNET_ENVIRONMENT": "Development", + "CORECLR_ENABLE_PROFILING": "1", + "CORECLR_PROFILER": "{918728DD-259F-4A6A-AC2B-B85E1B658318}", + "CORECLR_PROFILER_PATH": "$(SolutionDir)bin\\tracer-home\\win-x64\\OpenTelemetry.AutoInstrumentation.Native.dll", + 
"DOTNET_ADDITIONAL_DEPS": "$(SolutionDir)bin\\tracer-home\\AdditionalDeps", + "DOTNET_SHARED_STORE": "$(SolutionDir)bin\\tracer-home\\store", + "DOTNET_STARTUP_HOOKS": "$(SolutionDir)bin\\tracer-home\\net\\OpenTelemetry.AutoInstrumentation.StartupHook.dll", + "OTEL_DOTNET_AUTO_HOME": "$(SolutionDir)bin\\tracer-home", + "OTEL_DOTNET_AUTO_PLUGINS": "TestApplication.ContinuousProfiler.ThreadPlugin, TestApplication.ContinuousProfiler, Version=1.0.0.0, Culture=neutral, PublicKeyToken=null", // or "TestApplication.ContinuousProfiler.AllocationPlugin, TestApplication.ContinuousProfiler, Version=1.0.0.0, Culture=neutral, PublicKeyToken=null" for allocation testing purposes + "OTEL_DOTNET_AUTO_TRACES_ADDITIONAL_SOURCES": "TestApplication.ContinuousProfiler" + } + } + } +} diff --git a/test/test-applications/integrations/TestApplication.ContinuousProfiler/TestApplication.ContinuousProfiler.csproj b/test/test-applications/integrations/TestApplication.ContinuousProfiler/TestApplication.ContinuousProfiler.csproj index 327852e4fc..776c801def 100644 --- a/test/test-applications/integrations/TestApplication.ContinuousProfiler/TestApplication.ContinuousProfiler.csproj +++ b/test/test-applications/integrations/TestApplication.ContinuousProfiler/TestApplication.ContinuousProfiler.csproj @@ -24,4 +24,8 @@ + + + + diff --git a/test/test-applications/integrations/dependency-libs/TestApplication.ContinuousProfiler.Fs/ClassFs.fs b/test/test-applications/integrations/dependency-libs/TestApplication.ContinuousProfiler.Fs/ClassFs.fs index 38bc05cddf..6cdedec563 100644 --- a/test/test-applications/integrations/dependency-libs/TestApplication.ContinuousProfiler.Fs/ClassFs.fs +++ b/test/test-applications/integrations/dependency-libs/TestApplication.ContinuousProfiler.Fs/ClassFs.fs @@ -9,5 +9,5 @@ open System.Threading; module ClassFs = let methodFs testParam = Console.WriteLine("Thread.Sleep - starting " + testParam) - Thread.Sleep(TimeSpan.FromSeconds(2)) // Give a chance for the continuous 
profiler to collect a profile. + Thread.Sleep(TimeSpan.FromSeconds(5)) // Give a chance for the continuous profiler to collect a profile. Console.WriteLine("Thread.Sleep - finished")