Skip to content

Commit

Permalink
Tid translation for target processes running in a separate namespace. (
Browse files Browse the repository at this point in the history
…google#4195)

If the target process is running in a separate namespace, thread ids
(tids) observed from within the target process are different from the
tids observed in the root namespace.

This is a problem for events we obtain from the target process. These
are userspace instrumentation (usi), manual instrumentation and the Vulkan
layer. This PR is only about usi.

Usi events are emitted by the game, received in
ProducerEventProcessorHijackingFunctionEntryExitForLinuxTracing and
piped back into the LinuxTracing code. Finally, they are processed by
UprobesUnwindingVisitor, where they are used to emit FunctionCalls and
to help with call stack handling.

The UprobesUnwindingVisitor needs to translate the tids from inside the
UserSpaceFunction{Entry,Exit}PerfEventData events into root namespace
tids before handling them.

For that purpose we have the UprobesUnwindingVisitor observe the clone{3}
exit tracepoints and the task_newtask tracepoint. This allows us to
keep track of new threads and their tids. Additionally, at the beginning of
the capture, we also send the initial state of the tid mapping, which we
obtain from the /proc filesystem.
  • Loading branch information
danielfenner authored Sep 19, 2022
1 parent 10593d0 commit 0cff3f5
Show file tree
Hide file tree
Showing 13 changed files with 396 additions and 17 deletions.
6 changes: 6 additions & 0 deletions src/LinuxTracing/KernelTracepoints.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,4 +105,10 @@ struct __attribute__((__packed__)) dma_fence_signaled_tracepoint {
uint32_t seqno;
};

// Raw payload of a syscall exit tracepoint sample. The struct is packed so that it can be filled
// with a plain memcpy from the raw data of the perf record.
// NOTE(review): the field layout is presumably meant to mirror the kernel's format description of
// the syscalls:sys_exit_* tracepoints - confirm against
// /sys/kernel/debug/tracing/events/syscalls/sys_exit_clone/format.
struct __attribute__((__packed__)) syscall_exit_tracepoint {
// Header fields shared by all tracepoint payloads.
tracepoint_common common;
// Presumably the number of the syscall that is returning - TODO confirm.
uint64_t syscall_nr;
// Return value of the syscall. Stored unsigned here; readers cast it to the type they need.
uint64_t ret;
};

#endif // LINUX_TRACING_KERNEL_TRACEPOINTS_H_
34 changes: 34 additions & 0 deletions src/LinuxTracing/LinuxTracingUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "ModuleUtils/ReadLinuxMaps.h"
#include "ModuleUtils/VirtualAndAbsoluteAddresses.h"
#include "OrbitBase/ExecuteCommand.h"
#include "OrbitBase/GetProcessIds.h"
#include "OrbitBase/Logging.h"
#include "OrbitBase/ReadFileToString.h"
#include "OrbitBase/SafeStrerror.h"
Expand Down Expand Up @@ -333,4 +334,37 @@ std::map<uint64_t, std::string> FindFunctionsThatUprobesCannotInstrumentWithMess
return function_ids_to_error_messages;
}

// Builds the initial translation table from thread ids as seen inside the pid namespace of the
// target process to the matching thread ids in the root namespace.
//
// For every thread of `pid_in_root_namespace` the file /proc/<tid>/status is read and its "NSpid:"
// line is parsed. The last number on that line (the tid in the innermost namespace) becomes the
// key; the tid in the root namespace becomes the value. Threads whose status file cannot be read
// are left out of the result.
absl::flat_hash_map<pid_t, pid_t> RetrieveInitialTidToRootNamespaceTidMapping(
    pid_t pid_in_root_namespace) {
  absl::flat_hash_map<pid_t, pid_t> innermost_tid_to_root_tid;

  for (const pid_t root_tid : orbit_base::GetTidsOfProcess(pid_in_root_namespace)) {
    const std::string status_path = absl::StrFormat("/proc/%d/status", root_tid);
    auto status_content = orbit_base::ReadFileToString(status_path);
    // A failed read most likely means that the thread terminated between listing the tids and
    // opening its status file. Such a thread needs no mapping, so it is simply skipped.
    if (status_content.has_error()) continue;

    const std::vector<std::string> status_lines =
        absl::StrSplit(status_content.value(), '\n', absl::SkipEmpty());
    for (std::string_view status_line : status_lines) {
      if (absl::StartsWith(status_line, "NSpid:")) {
        // Format of the line we are interested in:
        //   NSpid: pid pid_1 ... pid_n
        // `pid` is the id in the root namespace, `pid_1` the id in the first nested namespace and
        // `pid_n` the id in the innermost namespace. Only the last field is needed here.
        const std::vector<std::string> fields =
            absl::StrSplit(status_line, absl::ByAnyChar(": \t"), absl::SkipWhitespace{});
        pid_t innermost_tid = 0;
        if (absl::SimpleAtoi(fields.back(), &innermost_tid)) {
          innermost_tid_to_root_tid[innermost_tid] = root_tid;
        } else {
          ORBIT_ERROR("Line in %s starting with 'NSpid:' did not end with a pid. Entire line was: %s",
                      status_path, status_line);
          // Give up on this status file; the thread gets no entry in the mapping.
          break;
        }
      }
    }
  }

  return innermost_tid_to_root_tid;
}

} // namespace orbit_linux_tracing
6 changes: 6 additions & 0 deletions src/LinuxTracing/LinuxTracingUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#ifndef LINUX_TRACING_LINUX_TRACING_UTILS_H_
#define LINUX_TRACING_LINUX_TRACING_UTILS_H_

#include <absl/container/flat_hash_map.h>
#include <unistd.h>

#include <ctime>
Expand Down Expand Up @@ -76,6 +77,11 @@ inline size_t GetPageSize() {
const std::vector<orbit_grpc_protos::ModuleInfo>& modules,
const std::vector<orbit_grpc_protos::InstrumentedFunction>& functions);

// Returns the map of tids in the target process namespace to the corresponding tids in the root
// namespace, covering the threads of the process `pid_in_root_namespace` that exist at the time
// of the call. The information is read from the "NSpid:" line of /proc/<tid>/status; threads for
// which this file cannot be read, or whose "NSpid:" line cannot be parsed, are omitted from the
// result.
[[nodiscard]] absl::flat_hash_map<pid_t, pid_t> RetrieveInitialTidToRootNamespaceTidMapping(
pid_t pid_in_root_namespace);

} // namespace orbit_linux_tracing

#endif // LINUX_TRACING_LINUX_TRACING_UTILS_H_
9 changes: 9 additions & 0 deletions src/LinuxTracing/LinuxTracingUtilsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -405,4 +405,13 @@ TEST(FindFunctionsThatUprobesCannotInstrumentWithMessages, ModuleNotInMaps) {
"not loaded by the process."));
}

// Queries the mapping for the test process itself. The test expects at least one entry and expects
// every tid to map onto itself.
// NOTE(review): this presumably relies on the test not being run inside a nested pid namespace -
// confirm for containerized CI setups.
TEST(RetrieveInitialTidToRootNamespaceTidMapping, TrivialMapFromTestProcess) {
  const pid_t own_pid = orbit_base::ToNativeThreadId(orbit_base::GetCurrentProcessId());
  const auto mapping = RetrieveInitialTidToRootNamespaceTidMapping(own_pid);

  EXPECT_FALSE(mapping.empty());
  for (const auto& [tid_in_target_namespace, tid_in_root_namespace] : mapping) {
    EXPECT_EQ(tid_in_target_namespace, tid_in_root_namespace);
  }
}

} // namespace orbit_linux_tracing
10 changes: 9 additions & 1 deletion src/LinuxTracing/PerfEvent.h
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,14 @@ struct SchedSwitchWithStackPerfEventData {
};
using SchedSwitchWithStackPerfEvent = TypedPerfEvent<SchedSwitchWithStackPerfEventData>;

// Data extracted from a hit of the exit tracepoint of the clone or clone3 syscall.
struct CloneExitPerfEventData {
// The tid of the thread invoking clone (in the root namespace).
pid_t tid;
// The return value of clone. This is the tid of the new thread (in the target process namespace).
// Zero when the tracepoint was hit on the execution path of the newly created thread.
pid_t ret_tid;
};
// Perf event carrying a CloneExitPerfEventData payload.
using CloneExitPerfEvent = TypedPerfEvent<CloneExitPerfEventData>;

// This struct holds the data we need from any of the possible perf_event_open events that we
// collect. The top-level fields (`timestamp` and `ordered_in_file_descriptor`) are common to all
// events, while each of the possible `...PerfEventData`s in the `std::variant` contains the data
Expand Down Expand Up @@ -411,7 +419,7 @@ struct PerfEvent {
TaskRenamePerfEventData, SchedSwitchPerfEventData, SchedWakeupPerfEventData,
SchedSwitchWithStackPerfEventData, SchedWakeupWithStackPerfEventData,
AmdgpuCsIoctlPerfEventData, AmdgpuSchedRunJobPerfEventData,
DmaFenceSignaledPerfEventData>
DmaFenceSignaledPerfEventData, CloneExitPerfEventData>
data;

void Accept(PerfEventVisitor* visitor) const;
Expand Down
23 changes: 23 additions & 0 deletions src/LinuxTracing/PerfEventReaders.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -685,4 +685,27 @@ ConsumeAmdgpuSchedRunJobPerfEvent(PerfEventRingBuffer* ring_buffer,
header);
}

// Reads one sample of a clone/clone3 exit tracepoint from `ring_buffer` and converts it into a
// CloneExitPerfEvent. The tid of the calling thread is taken from the sample itself; the return
// value of the syscall is taken from the raw tracepoint payload. The record is skipped in the ring
// buffer before the event is returned.
[[nodiscard]] CloneExitPerfEvent ConsumeCloneExitPerfEvent(PerfEventRingBuffer* ring_buffer,
                                                           const perf_event_header& header) {
  // Describes which fields the sample carries, so that ConsumeRecordSample can decode them.
  const perf_event_attr sample_layout{
      .sample_type = PERF_SAMPLE_RAW | SAMPLE_TYPE_TID_TIME_STREAMID_CPU,
  };
  PerfRecordSample sample = ConsumeRecordSample(ring_buffer, header, sample_layout);

  // Copy the raw bytes into the packed struct instead of casting the pointer.
  syscall_exit_tracepoint tracepoint_payload;
  std::memcpy(&tracepoint_payload, sample.raw_data.get(), sizeof(tracepoint_payload));

  ring_buffer->SkipRecord(header);

  const auto calling_tid = static_cast<pid_t>(sample.tid);
  const auto clone_return_value = static_cast<pid_t>(tracepoint_payload.ret);
  return CloneExitPerfEvent{
      .timestamp = sample.time,
      .ordered_stream = PerfEventOrderedStream::FileDescriptor(ring_buffer->GetFileDescriptor()),
      .data =
          {
              .tid = calling_tid,
              .ret_tid = clone_return_value,
          },
  };
}

} // namespace orbit_linux_tracing
3 changes: 3 additions & 0 deletions src/LinuxTracing/PerfEventReaders.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ AmdgpuSchedRunJobPerfEvent ConsumeAmdgpuSchedRunJobPerfEvent(PerfEventRingBuffer

DmaFenceSignaledPerfEvent ConsumeDmaFenceSignaledPerfEvent(PerfEventRingBuffer* ring_buffer,
const perf_event_header& header);

CloneExitPerfEvent ConsumeCloneExitPerfEvent(PerfEventRingBuffer* ring_buffer,
const perf_event_header& header);
} // namespace orbit_linux_tracing

#endif // LINUX_TRACING_PERF_EVENT_READERS_H_
1 change: 1 addition & 0 deletions src/LinuxTracing/PerfEventVisitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class PerfEventVisitor {
const DmaFenceSignaledPerfEventData& /*event_data*/) {}
// Default handler for generic tracepoint events: does nothing. Visitors interested in these
// events override this overload.
virtual void Visit(uint64_t /*event_timestamp*/,
const GenericTracepointPerfEventData& /*event_data*/) {}
// Default handler for clone/clone3 exit tracepoint events: does nothing. Visitors interested in
// these events override this overload.
virtual void Visit(uint64_t /*event_timestamp*/, const CloneExitPerfEventData& /*event_data*/) {}
};

} // namespace orbit_linux_tracing
Expand Down
48 changes: 39 additions & 9 deletions src/LinuxTracing/TracerImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <absl/meta/type_traits.h>
#include <absl/strings/str_format.h>
#include <absl/strings/str_join.h>
#include <absl/strings/str_split.h>
#include <absl/synchronization/mutex.h>
#include <stddef.h>
#include <string.h>
Expand Down Expand Up @@ -220,6 +221,11 @@ void TracerImpl::InitUprobesEventVisitor() {
&absolute_address_to_size_of_functions_to_stop_unwinding_at_);
uprobes_unwinding_visitor_->SetUnwindErrorsAndDiscardedSamplesCounters(
&stats_.unwind_error_count, &stats_.samples_in_uretprobes_count);
// Get the initial mapping of the tids in the target process to the corresponding tids in the
// root namespace.
absl::flat_hash_map<pid_t, pid_t> tid_mappings =
RetrieveInitialTidToRootNamespaceTidMapping(target_pid_);
uprobes_unwinding_visitor_->SetInitialTidToRootNamespaceTidMapping(std::move(tid_mappings));
event_processor_.AddVisitor(uprobes_unwinding_visitor_.get());
}

Expand Down Expand Up @@ -677,6 +683,20 @@ bool TracerImpl::OpenInstrumentedTracepoints(const std::vector<int32_t>& cpus) {
return !tracepoint_event_open_errors;
}

// We maintain a map of tids from the target process pid namespace to the root namespace. When a new
// thread is created the new tid in the root namespace is reported by the task:task_newtask which is
// already opened by OpenThreadNameTracepoints. The respective tid in the target process namespace
// is obtained from the next hit to syscalls:sys_exit_clone or syscalls:sys_exit_clone3.
bool TracerImpl::OpenCloneExitTracepoints(const std::vector<int32_t>& cpus) {
ORBIT_SCOPE_FUNCTION;
absl::flat_hash_map<int32_t, int> pid_mapping_tracepoint_ring_buffer_fds_per_cpu;
return OpenFileDescriptorsAndRingBuffersForAllTracepoints(
{{"syscalls", "sys_exit_clone", &sys_exit_clone_ids_},
{"syscalls", "sys_exit_clone3", &sys_exit_clone3_ids_}},
cpus, &tracing_fds_, CLONE_EXIT_RING_BUFFER_SIZE_KB,
&pid_mapping_tracepoint_ring_buffer_fds_per_cpu, &ring_buffers_, stack_dump_size_);
}

void TracerImpl::InitLostAndDiscardedEventVisitor() {
ORBIT_SCOPE_FUNCTION;
lost_and_discarded_event_visitor_ = std::make_unique<LostAndDiscardedEventVisitor>(listener_);
Expand Down Expand Up @@ -752,6 +772,11 @@ void TracerImpl::Startup() {
bool perf_event_open_errors = false;
std::vector<std::string> perf_event_open_error_details;

if (bool opened = OpenCloneExitTracepoints(all_cpus); !opened) {
perf_event_open_error_details.emplace_back("clone exit tracepoints");
perf_event_open_errors = true;
}

if (bool opened = OpenMmapTask(all_cpus); !opened) {
perf_event_open_error_details.emplace_back("mmap events, fork and exit events");
perf_event_open_errors = true;
Expand Down Expand Up @@ -1120,15 +1145,17 @@ uint64_t TracerImpl::ProcessSampleEventAndReturnTimestamp(const perf_event_heade
bool is_amdgpu_sched_run_job_event = amdgpu_sched_run_job_ids_.contains(stream_id);
bool is_dma_fence_signaled_event = dma_fence_signaled_ids_.contains(stream_id);
bool is_user_instrumented_tracepoint = ids_to_tracepoint_info_.contains(stream_id);

ORBIT_CHECK(is_uprobe + is_uprobe_with_args + is_uprobe_with_stack + is_uretprobe +
is_uretprobe_with_retval + is_stack_sample + is_callchain_sample +
is_task_newtask + is_task_rename + is_sched_switch + is_sched_wakeup +
is_sched_switch_with_callchain + is_sched_wakeup_with_callchain +
is_sched_switch_with_stack + is_sched_wakeup_with_stack +
is_amdgpu_cs_ioctl_event + is_amdgpu_sched_run_job_event +
is_dma_fence_signaled_event + is_user_instrumented_tracepoint <=
1);
bool is_clone_exit_tracepoint =
sys_exit_clone_ids_.contains(stream_id) || sys_exit_clone3_ids_.contains(stream_id);

ORBIT_CHECK(
is_uprobe + is_uprobe_with_args + is_uprobe_with_stack + is_uretprobe +
is_uretprobe_with_retval + is_stack_sample + is_callchain_sample + is_task_newtask +
is_task_rename + is_sched_switch + is_sched_wakeup + is_sched_switch_with_callchain +
is_sched_wakeup_with_callchain + is_sched_switch_with_stack + is_sched_wakeup_with_stack +
is_amdgpu_cs_ioctl_event + is_amdgpu_sched_run_job_event + is_dma_fence_signaled_event +
is_user_instrumented_tracepoint + is_clone_exit_tracepoint <=
1);

int fd = ring_buffer->GetFileDescriptor();

Expand Down Expand Up @@ -1410,6 +1437,9 @@ uint64_t TracerImpl::ProcessSampleEventAndReturnTimestamp(const perf_event_heade
tracepoint_info->set_category(it->second.category());

listener_->OnTracepointEvent(std::move(tracepoint_event));
} else if (is_clone_exit_tracepoint) {
CloneExitPerfEvent event = ConsumeCloneExitPerfEvent(ring_buffer, header);
DeferEvent(std::move(event));
} else {
ORBIT_ERROR("PERF_EVENT_SAMPLE with unexpected stream_id: %lu", stream_id);
ring_buffer->SkipRecord(header);
Expand Down
5 changes: 5 additions & 0 deletions src/LinuxTracing/TracerImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ class TracerImpl : public Tracer {

bool OpenInstrumentedTracepoints(const std::vector<int32_t>& cpus);

// Opens the syscalls:sys_exit_clone and syscalls:sys_exit_clone3 tracepoints for the given cpus.
// Returns the result of opening them.
bool OpenCloneExitTracepoints(const std::vector<int32_t>& cpus);

void InitLostAndDiscardedEventVisitor();

[[nodiscard]] uint64_t ProcessForkEventAndReturnTimestamp(const perf_event_header& header,
Expand Down Expand Up @@ -135,6 +137,7 @@ class TracerImpl : public Tracer {
static constexpr uint64_t MMAP_TASK_RING_BUFFER_SIZE_KB = 64;
static constexpr uint64_t SAMPLING_RING_BUFFER_SIZE_KB = 16 * 1024;
static constexpr uint64_t THREAD_NAMES_RING_BUFFER_SIZE_KB = 64;
// Ring buffer size (in KB) requested when opening the clone/clone3 exit tracepoints.
static constexpr uint64_t CLONE_EXIT_RING_BUFFER_SIZE_KB = 64;
static constexpr uint64_t CONTEXT_SWITCHES_AND_THREAD_STATE_RING_BUFFER_SIZE_KB = 2 * 1024;
static constexpr uint64_t CONTEXT_SWITCHES_AND_THREAD_STATE_WITH_STACKS_RING_BUFFER_SIZE_KB =
64 * 1024;
Expand Down Expand Up @@ -191,6 +194,8 @@ class TracerImpl : public Tracer {
absl::flat_hash_set<uint64_t> amdgpu_cs_ioctl_ids_;
absl::flat_hash_set<uint64_t> amdgpu_sched_run_job_ids_;
absl::flat_hash_set<uint64_t> dma_fence_signaled_ids_;
// Stream ids of the opened syscalls:sys_exit_clone and syscalls:sys_exit_clone3 tracepoints. A
// sample whose stream id is contained in either set is treated as a clone exit event.
absl::flat_hash_set<uint64_t> sys_exit_clone_ids_;
absl::flat_hash_set<uint64_t> sys_exit_clone3_ids_;
absl::flat_hash_map<uint64_t, orbit_grpc_protos::TracepointInfo> ids_to_tracepoint_info_;

uint64_t effective_capture_start_timestamp_ns_ = 0;
Expand Down
91 changes: 84 additions & 7 deletions src/LinuxTracing/UprobesUnwindingVisitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -528,22 +528,50 @@ void UprobesUnwindingVisitor::Visit(uint64_t event_timestamp,

// Handles a function entry event produced by user space instrumentation.
//
// The tid in `event_data` was observed from within the target process and hence is a tid in the
// pid namespace of the target. It is translated into the root namespace before being handed to the
// function call manager and the return address manager. Events from threads for which no
// translation is known are dropped (an error is logged once).
//
// The event is forwarded exactly once, with the translated tid. The untranslated tid must not be
// passed on.
void UprobesUnwindingVisitor::Visit(uint64_t event_timestamp,
                                    const UserSpaceFunctionEntryPerfEventData& event_data) {
  const auto tid_to_root_namespace_tid_it = tid_to_root_namespace_tid_.find(event_data.tid);
  if (tid_to_root_namespace_tid_it == tid_to_root_namespace_tid_.end()) {
    ORBIT_ERROR_ONCE(
        "Received function entry event from unknown thread with tid %d. Dropping this event and "
        "also all subsequent events from unknown threads.",
        event_data.tid);
    return;
  }
  const pid_t tid = tid_to_root_namespace_tid_it->second;

  function_call_manager_->ProcessFunctionEntry(tid, event_data.function_id, event_timestamp,
                                               std::nullopt);

  return_address_manager_->ProcessFunctionEntry(tid, event_data.sp, event_data.return_address);
}

// Handles a function exit event produced by user space instrumentation.
//
// Both the tid and the pid in `event_data` were observed from within the target process and hence
// live in the pid namespace of the target. Both are translated into the root namespace through
// tid_to_root_namespace_tid_ before the event is processed. If either translation is unknown, the
// event is dropped (an error is logged once per case).
//
// The exit is processed exactly once, with the translated ids. A resulting FunctionCall, if any,
// is forwarded to the listener.
void UprobesUnwindingVisitor::Visit(uint64_t event_timestamp,
                                    const UserSpaceFunctionExitPerfEventData& event_data) {
  const auto tid_to_root_namespace_tid_it = tid_to_root_namespace_tid_.find(event_data.tid);
  if (tid_to_root_namespace_tid_it == tid_to_root_namespace_tid_.end()) {
    ORBIT_ERROR_ONCE(
        "Received function exit event from unknown thread with tid %d. Dropping this event and "
        "also all subsequent user space dynamic instrumentation events from unknown threads.",
        event_data.tid);
    return;
  }
  // The pid is looked up in the same table as the tids.
  const auto pid_to_root_namespace_pid_it = tid_to_root_namespace_tid_.find(event_data.pid);
  if (pid_to_root_namespace_pid_it == tid_to_root_namespace_tid_.end()) {
    ORBIT_ERROR_ONCE(
        "Received function exit event from unknown process with pid %d. Dropping this event and "
        "also all subsequent user space dynamic instrumentation events from unknown processes.",
        event_data.pid);
    return;
  }
  const pid_t tid = tid_to_root_namespace_tid_it->second;
  const pid_t pid = pid_to_root_namespace_pid_it->second;

  std::optional<FunctionCall> function_call =
      function_call_manager_->ProcessFunctionExit(pid, tid, event_timestamp, std::nullopt);
  if (function_call.has_value()) {
    listener_->OnFunctionCall(std::move(function_call.value()));
  }

  return_address_manager_->ProcessFunctionExit(tid);
}

void UprobesUnwindingVisitor::Visit(uint64_t /*event_timestamp*/,
Expand Down Expand Up @@ -759,4 +787,53 @@ void UprobesUnwindingVisitor::Visit(uint64_t event_timestamp, const MmapPerfEven
listener_->OnModuleUpdate(std::move(module_update_event));
}

// The task_newtask tracepoint is hit for every newly created thread. It reports the tid of the
// creating (parent) thread and the tid of the new thread, both in the root namespace. This handler
// remembers the pair in new_task_root_namespace_parent_tid_to_root_namespace_tid_, keyed by the
// parent tid.
// Right after task_newtask, the clone or clone3 syscall that created the thread returns, and the
// corresponding exit tracepoint is observed as well (see the handler for CloneExitPerfEventData).
// That tracepoint provides the tid of the parent thread (in the root namespace) and the return
// value of clone, which is the tid of the new thread in the namespace of the target process.
// Matching both events through the parent tid yields the mapping from the tid in the namespace of
// the target process to the tid in the root namespace, which is kept in
// tid_to_root_namespace_tid_.
void UprobesUnwindingVisitor::Visit(uint64_t /*event_timestamp*/,
                                    const TaskNewtaskPerfEventData& event_data) {
  const auto parent_tid = event_data.was_created_by_tid;

  // A pending entry for the same parent means that the previous task_newtask was never paired with
  // a clone exit event. Report it; the stale entry is overwritten below.
  const bool has_unmatched_entry =
      new_task_root_namespace_parent_tid_to_root_namespace_tid_.find(parent_tid) !=
      new_task_root_namespace_parent_tid_to_root_namespace_tid_.end();
  if (has_unmatched_entry) {
    ORBIT_ERROR(
        "Observed a task_newtask event from thread %d without matching clone exit event. This "
        "should never happen.",
        parent_tid);
  }

  new_task_root_namespace_parent_tid_to_root_namespace_tid_[parent_tid] = event_data.new_tid;
}

// Handles the return from a clone or clone3 syscall.
//
// `event_data.tid` is the tid of the thread that invoked clone (in the root namespace) and
// `event_data.ret_tid` is the return value of the syscall. For a successful call in the parent,
// the return value is the tid of the new thread in the namespace of the target process. It is
// paired with the root namespace tid recorded by the preceding task_newtask event of the same
// parent thread, and the resulting pair is stored in tid_to_root_namespace_tid_. The pending
// task_newtask entry is consumed by this.
void UprobesUnwindingVisitor::Visit(uint64_t /*event_timestamp*/,
                                    const CloneExitPerfEventData& event_data) {
  // A return value of zero means this tracepoint was hit on the execution path of the newly
  // created thread. A negative return value means the syscall failed and no thread was created
  // (and hence no task_newtask event was emitted for it). Neither case carries a new tid, so such
  // events are discarded.
  if (event_data.ret_tid <= 0) return;

  const pid_t parent_tid = event_data.tid;
  const pid_t tid_in_target_process_namespace = event_data.ret_tid;
  const auto new_task_root_namespace_parent_tid_to_root_namespace_tid_it =
      new_task_root_namespace_parent_tid_to_root_namespace_tid_.find(parent_tid);
  if (new_task_root_namespace_parent_tid_to_root_namespace_tid_it ==
      new_task_root_namespace_parent_tid_to_root_namespace_tid_.end()) {
    ORBIT_ERROR(
        "Observed a return from clone without previously seeing a task_newtask from the same "
        "parent thread. parent_tid was %d; clone return was %d. We will ignore user space dynamic "
        "instrumentation form this thread.",
        parent_tid, tid_in_target_process_namespace);
    return;
  }
  tid_to_root_namespace_tid_[tid_in_target_process_namespace] =
      new_task_root_namespace_parent_tid_to_root_namespace_tid_it->second;
  // The pending entry has been matched; remove it so the next task_newtask from this parent does
  // not look like an unmatched one.
  new_task_root_namespace_parent_tid_to_root_namespace_tid_.erase(
      new_task_root_namespace_parent_tid_to_root_namespace_tid_it);
}

} // namespace orbit_linux_tracing
Loading

0 comments on commit 0cff3f5

Please sign in to comment.