Skip to content

Commit

Permalink
i#7067 sched cpu limits: Add thread cpu footprint to schedule_stats (#7095)

Browse files Browse the repository at this point in the history

Augments schedule_stats to record how many cores each thread appears on.
This is used to compute an overall histogram of cores per thread, which
is a useful metric for studying migrations over time. This is done in a
multi-workload-aware manner to avoid problems with duplicate thread ids
across workloads.

This adds new pure-virtual functions to the
schedule_stats_t::histogram_t interface, so subclasses will need to be
updated.
It also changes schedule_stats_t::counters_t.threads to use
workload_tid_t rather than just tid.

Adds some multi-workload support to the testing helpers in
default_memtrace_stream_t and memref_gen.

Adds a unit test and updates existing tests.

Issue: #7067
  • Loading branch information
derekbruening authored Nov 26, 2024
1 parent 4a2301b commit a7e58ee
Show file tree
Hide file tree
Showing 8 changed files with 167 additions and 56 deletions.
11 changes: 11 additions & 0 deletions clients/drcachesim/common/memtrace_stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,11 @@ class default_memtrace_stream_t : public memtrace_stream_t {
{
return shard_;
}
// Accessor for the workload identifier previously stored via set_workload_id().
int64_t get_workload_id() const override { return workload_id_; }
// Also sets the shard index to the dynamic-discovery-order tid ordinal.
void
set_tid(int64_t tid)
Expand All @@ -413,12 +418,18 @@ class default_memtrace_stream_t : public memtrace_stream_t {
{
return tid_;
}
// Records the workload identifier that get_workload_id() will report.
void set_workload_id(int64_t id) { workload_id_ = id; }

private:
// Externally owned record counter; may be null (default). NOTE(review):
// presumably read by get_record_ordinal() — that method is outside this view.
uint64_t *record_ordinal_ = nullptr;
int64_t cpuid_ = 0;
int shard_ = 0;
int64_t tid_ = 0;
// Workload id for multi-workload tests; set via set_workload_id().
int64_t workload_id_ = 0;
// To let a test set just the tid and get a shard index for free.
std::unordered_map<int64_t, int> tid2shard_;
};
Expand Down
4 changes: 3 additions & 1 deletion clients/drcachesim/tests/core_serial.templatex
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Schedule stats tool results:
Total counts:
4 cores
8 threads: 1257.*
8 threads: W0.T1257.*
638938 instructions
6 total context switches
0.0093906 CSPKI \(context switches per 1000 instructions\)
Expand Down Expand Up @@ -35,6 +35,8 @@ Total counts:
Instructions per context switch histogram:
0.. 50000 2
50000.. 100000 4
Cores per thread:
1.. 2 8
Core #0 counts:
.*
Core #1 counts:
Expand Down
8 changes: 5 additions & 3 deletions clients/drcachesim/tests/memref_gen.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,21 +85,23 @@ gen_data(memref_tid_t tid, bool load, addr_t addr, size_t size)

// Builds a memref_t instruction record of the given trace type.
// The trailing "pid" parameter defaults to 0 for backward compatibility;
// multi-workload tests pass a nonzero pid to distinguish workloads
// (e.g., schedule_stats_test feeds it to set_workload_id()).
// (Fix: the scraped diff retained both the old and new parameter lists,
// which does not compile; this is the merged post-change signature.)
inline memref_t
gen_instr_type(trace_type_t type, memref_tid_t tid, addr_t pc = 1, size_t size = 1,
               addr_t indirect_branch_target = 0, memref_pid_t pid = 0)
{
    memref_t memref = {};
    memref.instr.type = type;
    memref.instr.tid = tid;
    memref.instr.pid = pid;
    memref.instr.addr = pc;
    memref.instr.size = size;
    memref.instr.indirect_branch_target = indirect_branch_target;
    return memref;
}

// Convenience wrapper for gen_instr_type() that generates a plain
// TRACE_TYPE_INSTR record. The "pid" parameter defaults to 0 so existing
// single-workload callers are unaffected.
// (Fix: the scraped diff retained both the old and new signature/return
// lines, which does not compile; this is the merged post-change version.)
inline memref_t
gen_instr(memref_tid_t tid, addr_t pc = 1, size_t size = 1, memref_pid_t pid = 0)
{
    return gen_instr_type(TRACE_TYPE_INSTR, tid, pc, size, /*indirect_branch_target=*/0,
                          pid);
}

inline memref_t
Expand Down
2 changes: 1 addition & 1 deletion clients/drcachesim/tests/only_shards.templatex
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Schedule stats tool results:
Total counts:
2 cores
4 threads: 1257600, 1257602, 1257599, 1257603
4 threads: W0.T1257600, W0.T1257602, W0.T1257599, W0.T1257603
.*
Core #0 schedule: FJ_
Core #1 schedule: GI
12 changes: 7 additions & 5 deletions clients/drcachesim/tests/schedule_stats_nopreempt.templatex
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Schedule stats tool results:
Total counts:
4 cores
8 threads: 1257.*
8 threads: W0.T1257.*
638938 instructions
6 total context switches
0\.0093906 CSPKI \(context switches per 1000 instructions\)
Expand Down Expand Up @@ -35,8 +35,10 @@ Total counts:
Instructions per context switch histogram:
0.. 50000 2
50000.. 100000 4
Cores per thread:
1.. 2 8
Core #0 counts:
. threads: 1257.*
. threads: W0.T1257.*
*[0-9]* instructions
. total context switches
0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
Expand All @@ -47,7 +49,7 @@ Core #0 counts:
0\.00% direct switches
.*
Core #1 counts:
. threads: 1257.*
. threads: W0.T1257.*
*[0-9]* instructions
. total context switches
0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
Expand All @@ -58,7 +60,7 @@ Core #1 counts:
0\.00% direct switches
.*
Core #2 counts:
. threads: 1257.*
. threads: W0.T1257.*
*[0-9]* instructions
. total context switches
0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
Expand All @@ -69,7 +71,7 @@ Core #2 counts:
0\.00% direct switches
.*
Core #3 counts:
. threads: 1257.*
. threads: W0.T1257.*
*[0-9]* instructions
. total context switches
0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
Expand Down
46 changes: 45 additions & 1 deletion clients/drcachesim/tests/schedule_stats_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ run_schedule_stats(const std::vector<std::vector<memref_t>> &memrefs)
continue;
memref_t memref = memrefs[cpu][per_core[cpu].memref_idx];
per_core[cpu].stream.set_tid(memref.instr.tid);
per_core[cpu].stream.set_workload_id(memref.instr.pid);
per_core[cpu].stream.set_output_cpuid(cpu);
bool res = tool.parallel_shard_memref(per_core[cpu].shard_data, memref);
assert(res);
Expand Down Expand Up @@ -280,12 +281,55 @@ test_idle()
return true;
}

// Verifies the cores-per-thread footprint histogram, including that threads
// with identical tids in different workloads are counted separately.
static bool
test_cpu_footprint()
{
// Workload ids ride in the pid field: run_schedule_stats() calls
// set_workload_id(memref.instr.pid) for each record.
static constexpr int64_t PID_X = 1234;
static constexpr int64_t PID_Y = 5678;
static constexpr int64_t TID_A = 42;
static constexpr int64_t TID_B = 142;
static constexpr int64_t TID_C = 242;
static constexpr addr_t INSTR_PC = 1001;
static constexpr size_t INSTR_SIZE = 4;
// Outer index is the core. Resulting per-thread core footprints:
//   X.A on cores {0,2}; X.B on {0}; Y.A on {0,1}; Y.C on {2,3,4}.
std::vector<std::vector<memref_t>> memrefs = {
{
gen_instr(TID_A, INSTR_PC, INSTR_SIZE, PID_X),
gen_instr(TID_B, INSTR_PC, INSTR_SIZE, PID_X),
// Test identical tids in different workloads.
gen_instr(TID_A, INSTR_PC, INSTR_SIZE, PID_Y),
},
{
gen_instr(TID_A, INSTR_PC, INSTR_SIZE, PID_Y),
},
{
gen_instr(TID_C, INSTR_PC, INSTR_SIZE, PID_Y),
gen_instr(TID_A, INSTR_PC, INSTR_SIZE, PID_X),
},
{
gen_instr(TID_C, INSTR_PC, INSTR_SIZE, PID_Y),
},
{
gen_instr(TID_C, INSTR_PC, INSTR_SIZE, PID_Y),
},
};
auto result = run_schedule_stats(memrefs);
// 8 instruction records in total across the 5 cores above.
assert(result.instrs == 8);
std::string hist = result.cores_per_thread->to_string();
std::cerr << "Cores-per-thread histogram:\n" << hist << "\n";
// We expect X.A=2, X.B=1, Y.A=2, Y.C=3:
// i.e. one thread on 1 core, two threads on 2 cores, one thread on 3 cores.
assert(hist ==
" 1.. 2 1\n"
" 2.. 3 2\n"
" 3.. 4 1\n");
return true;
}

} // namespace

int
test_main(int argc, const char *argv[])
{
if (test_basic_stats() && test_idle()) {
if (test_basic_stats() && test_idle() && test_cpu_footprint()) {
std::cerr << "schedule_stats_test passed\n";
return 0;
}
Expand Down
20 changes: 18 additions & 2 deletions clients/drcachesim/tools/schedule_stats.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref
shard->saw_exit = false;
}
if (memref.instr.tid != INVALID_THREAD_ID)
shard->counters.threads.insert(memref.instr.tid);
shard->counters.threads.insert(workload_tid_t(workload_id, memref.instr.tid));
if (memref.marker.type == TRACE_TYPE_MARKER) {
if (memref.marker.marker_type == TRACE_MARKER_TYPE_SYSCALL) {
++shard->counters.syscalls;
Expand Down Expand Up @@ -397,7 +397,7 @@ schedule_stats_t::print_counters(const counters_t &counters)
std::cerr << ": ";
auto it = counters.threads.begin();
while (it != counters.threads.end()) {
std::cerr << *it;
std::cerr << "W" << it->workload_id << ".T" << it->tid;
++it;
if (it != counters.threads.end())
std::cerr << ", ";
Expand Down Expand Up @@ -467,17 +467,27 @@ schedule_stats_t::print_counters(const counters_t &counters)
"% cpu busy by time, ignoring idle past last instr\n");
std::cerr << " Instructions per context switch histogram:\n";
counters.instrs_per_switch->print();
if (!counters.cores_per_thread->empty()) {
std::cerr << " Cores per thread:\n";
counters.cores_per_thread->print();
}
}

void
schedule_stats_t::aggregate_results(counters_t &total)
{
std::unordered_map<workload_tid_t, std::unordered_set<int64_t>, workload_tid_hash_t>
cpu_footprint;
for (const auto &shard : shard_map_) {
// First update our per-shard data with per-shard stats from the scheduler.
get_scheduler_stats(shard.second->stream, shard.second->counters);

total += shard.second->counters;

for (const workload_tid_t wtid : shard.second->counters.threads) {
cpu_footprint[wtid].insert(shard.second->core);
}

// Sanity check against the scheduler's own stats, unless the trace
// is pre-scheduled, or we're in core-serial mode where we don't have access
// to the separate output streams, or we're in a unit test with a mock
Expand Down Expand Up @@ -508,6 +518,12 @@ schedule_stats_t::aggregate_results(counters_t &total)
// will be 0; for mock streams in tests it will be < 0; otherwise, the scheduler
// may see more migrations due to inputs not yet executed moving among runqueues.
assert(total.migrations <= 0. || total.migrations >= total.observed_migrations);

// The += had no data to add as we do not update cores_per_thread incrementally
// and instead derive it from counters_t.threads via cpu_footprint here.
for (const auto &entry : cpu_footprint) {
total.cores_per_thread->add(entry.second.size());
}
}

bool
Expand Down
Loading

0 comments on commit a7e58ee

Please sign in to comment.