diff --git a/clients/drcachesim/common/memtrace_stream.h b/clients/drcachesim/common/memtrace_stream.h
index c8e59945624..b479ff5785c 100644
--- a/clients/drcachesim/common/memtrace_stream.h
+++ b/clients/drcachesim/common/memtrace_stream.h
@@ -394,6 +394,11 @@ class default_memtrace_stream_t : public memtrace_stream_t {
     {
         return shard_;
     }
+    int64_t
+    get_workload_id() const override
+    {
+        return workload_id_;
+    }
     // Also sets the shard index to the dynamic-discovery-order tid ordinal.
     void
     set_tid(int64_t tid)
@@ -413,12 +418,18 @@ class default_memtrace_stream_t : public memtrace_stream_t {
     {
         return tid_;
     }
+    void
+    set_workload_id(int64_t id)
+    {
+        workload_id_ = id;
+    }

 private:
     uint64_t *record_ordinal_ = nullptr;
     int64_t cpuid_ = 0;
     int shard_ = 0;
     int64_t tid_ = 0;
+    int64_t workload_id_ = 0;
     // To let a test set just the tid and get a shard index for free.
     std::unordered_map<int64_t, int> tid2shard_;
 };
diff --git a/clients/drcachesim/tests/core_serial.templatex b/clients/drcachesim/tests/core_serial.templatex
index 62314d5eecb..1fdc9b85311 100644
--- a/clients/drcachesim/tests/core_serial.templatex
+++ b/clients/drcachesim/tests/core_serial.templatex
@@ -1,7 +1,7 @@
 Schedule stats tool results:
 Total counts:
            4 cores
-           8 threads: 1257.*
+           8 threads: W0.T1257.*
      638938 instructions
           6 total context switches
   0.0093906 CSPKI \(context switches per 1000 instructions\)
@@ -35,6 +35,8 @@ Total counts:
   Instructions per context switch histogram:
            0..   50000     2
        50000..  100000     4
+  Cores per thread:
+           1..       2     8
 Core #0 counts:
 .*
 Core #1 counts:
diff --git a/clients/drcachesim/tests/memref_gen.h b/clients/drcachesim/tests/memref_gen.h
index 785b49e2b45..1515bd915e1 100644
--- a/clients/drcachesim/tests/memref_gen.h
+++ b/clients/drcachesim/tests/memref_gen.h
@@ -85,11 +85,12 @@ gen_data(memref_tid_t tid, bool load, addr_t addr, size_t size)

 inline memref_t
 gen_instr_type(trace_type_t type, memref_tid_t tid, addr_t pc = 1, size_t size = 1,
-               addr_t indirect_branch_target = 0)
+               addr_t indirect_branch_target = 0, memref_pid_t pid = 0)
 {
     memref_t memref = {};
     memref.instr.type = type;
     memref.instr.tid = tid;
+    memref.instr.pid = pid;
     memref.instr.addr = pc;
     memref.instr.size = size;
     memref.instr.indirect_branch_target = indirect_branch_target;
@@ -97,9 +98,10 @@ gen_instr_type(trace_type_t type, memref_tid_t tid, addr_t pc = 1, size_t size =
 }

 inline memref_t
-gen_instr(memref_tid_t tid, addr_t pc = 1, size_t size = 1)
+gen_instr(memref_tid_t tid, addr_t pc = 1, size_t size = 1, memref_pid_t pid = 0)
 {
-    return gen_instr_type(TRACE_TYPE_INSTR, tid, pc, size);
+    return gen_instr_type(TRACE_TYPE_INSTR, tid, pc, size, /*indirect_branch_target=*/0,
+                          pid);
 }

 inline memref_t
diff --git a/clients/drcachesim/tests/only_shards.templatex b/clients/drcachesim/tests/only_shards.templatex
index 44770a1b358..dd88dc4460c 100644
--- a/clients/drcachesim/tests/only_shards.templatex
+++ b/clients/drcachesim/tests/only_shards.templatex
@@ -1,7 +1,7 @@
 Schedule stats tool results:
 Total counts:
            2 cores
-           4 threads: 1257600, 1257602, 1257599, 1257603
+           4 threads: W0.T1257600, W0.T1257602, W0.T1257599, W0.T1257603
 .*
 Core #0 schedule: FJ_
 Core #1 schedule: GI
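[Illustration, not part of the patch. The new trailing pid parameter lets a test tag generated records with a workload identity, while the defaults keep every existing call site compiling unchanged. A minimal usage sketch in the style of the test changes below, with made-up tid/pc/pid values:

    // Same tid in two different workloads: only the pid distinguishes them.
    memref_t wx = gen_instr(/*tid=*/42, /*pc=*/0x1000, /*size=*/4, /*pid=*/1234);
    memref_t wy = gen_instr(/*tid=*/42, /*pc=*/0x1000, /*size=*/4, /*pid=*/5678);
    assert(wx.instr.tid == wy.instr.tid && wx.instr.pid != wy.instr.pid);
]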
diff --git a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex
index 40246ff30a5..b04b4d291de 100644
--- a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex
+++ b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex
@@ -1,7 +1,7 @@
 Schedule stats tool results:
 Total counts:
            4 cores
-           8 threads: 1257.*
+           8 threads: W0.T1257.*
      638938 instructions
           6 total context switches
   0\.0093906 CSPKI \(context switches per 1000 instructions\)
@@ -35,8 +35,10 @@ Total counts:
   Instructions per context switch histogram:
            0..   50000     2
        50000..  100000     4
+  Cores per thread:
+           1..       2     8
 Core #0 counts:
-           . threads: 1257.*
+           . threads: W0.T1257.*
  *[0-9]* instructions
            . total context switches
   0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
@@ -47,7 +49,7 @@ Core #0 counts:
         0\.00% direct switches
 .*
 Core #1 counts:
-           . threads: 1257.*
+           . threads: W0.T1257.*
  *[0-9]* instructions
            . total context switches
   0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
@@ -58,7 +60,7 @@ Core #1 counts:
         0\.00% direct switches
 .*
 Core #2 counts:
-           . threads: 1257.*
+           . threads: W0.T1257.*
  *[0-9]* instructions
            . total context switches
   0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
@@ -69,7 +71,7 @@ Core #2 counts:
         0\.00% direct switches
 .*
 Core #3 counts:
-           . threads: 1257.*
+           . threads: W0.T1257.*
  *[0-9]* instructions
            . total context switches
   0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
diff --git a/clients/drcachesim/tests/schedule_stats_test.cpp b/clients/drcachesim/tests/schedule_stats_test.cpp
index 75f7db93d9e..caf5820e2c3 100644
--- a/clients/drcachesim/tests/schedule_stats_test.cpp
+++ b/clients/drcachesim/tests/schedule_stats_test.cpp
@@ -122,6 +122,7 @@ run_schedule_stats(const std::vector<std::vector<memref_t>> &memrefs)
                 continue;
             memref_t memref = memrefs[cpu][per_core[cpu].memref_idx];
             per_core[cpu].stream.set_tid(memref.instr.tid);
+            per_core[cpu].stream.set_workload_id(memref.instr.pid);
             per_core[cpu].stream.set_output_cpuid(cpu);
             bool res = tool.parallel_shard_memref(per_core[cpu].shard_data, memref);
             assert(res);
@@ -280,12 +281,55 @@
     return true;
 }

+static bool
+test_cpu_footprint()
+{
+    static constexpr int64_t PID_X = 1234;
+    static constexpr int64_t PID_Y = 5678;
+    static constexpr int64_t TID_A = 42;
+    static constexpr int64_t TID_B = 142;
+    static constexpr int64_t TID_C = 242;
+    static constexpr addr_t INSTR_PC = 1001;
+    static constexpr size_t INSTR_SIZE = 4;
+    std::vector<std::vector<memref_t>> memrefs = {
+        {
+            gen_instr(TID_A, INSTR_PC, INSTR_SIZE, PID_X),
+            gen_instr(TID_B, INSTR_PC, INSTR_SIZE, PID_X),
+            // Test identical tids in different workloads.
+            gen_instr(TID_A, INSTR_PC, INSTR_SIZE, PID_Y),
+        },
+        {
+            gen_instr(TID_A, INSTR_PC, INSTR_SIZE, PID_Y),
+        },
+        {
+            gen_instr(TID_C, INSTR_PC, INSTR_SIZE, PID_Y),
+            gen_instr(TID_A, INSTR_PC, INSTR_SIZE, PID_X),
+        },
+        {
+            gen_instr(TID_C, INSTR_PC, INSTR_SIZE, PID_Y),
+        },
+        {
+            gen_instr(TID_C, INSTR_PC, INSTR_SIZE, PID_Y),
+        },
+    };
+    auto result = run_schedule_stats(memrefs);
+    assert(result.instrs == 8);
+    std::string hist = result.cores_per_thread->to_string();
+    std::cerr << "Cores-per-thread histogram:\n" << hist << "\n";
+    // We expect X.A=2, X.B=1, Y.A=2, Y.C=3:
+    assert(hist ==
+           "           1..       2     1\n"
+           "           2..       3     2\n"
+           "           3..       4     1\n");
+    return true;
+}
+
 } // namespace

 int
 test_main(int argc, const char *argv[])
 {
-    if (test_basic_stats() && test_idle()) {
+    if (test_basic_stats() && test_idle() && test_cpu_footprint()) {
         std::cerr << "schedule_stats_test passed\n";
         return 0;
     }
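[Aside on the histogram asserted in test_cpu_footprint, derived from the test data above: core 0 runs W1234.T42, W1234.T142, and W5678.T42; core 1 runs W5678.T42; core 2 runs W5678.T242 and W1234.T42; cores 3 and 4 each run only W5678.T242. The per-thread core footprints are therefore X.A=2, X.B=1, Y.A=2, Y.C=3, and with a bin size of 1 that yields one thread in bin [1,2), two in [2,3), and one in [3,4), matching the three asserted lines.]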
diff --git a/clients/drcachesim/tools/schedule_stats.cpp b/clients/drcachesim/tools/schedule_stats.cpp
index 979d3df588e..5d4b26b6b81 100644
--- a/clients/drcachesim/tools/schedule_stats.cpp
+++ b/clients/drcachesim/tools/schedule_stats.cpp
@@ -354,7 +354,7 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref
         shard->saw_exit = false;
     }
     if (memref.instr.tid != INVALID_THREAD_ID)
-        shard->counters.threads.insert(memref.instr.tid);
+        shard->counters.threads.insert(workload_tid_t(workload_id, memref.instr.tid));
     if (memref.marker.type == TRACE_TYPE_MARKER) {
         if (memref.marker.marker_type == TRACE_MARKER_TYPE_SYSCALL) {
             ++shard->counters.syscalls;
@@ -397,7 +397,7 @@ schedule_stats_t::print_counters(const counters_t &counters)
         std::cerr << ": ";
         auto it = counters.threads.begin();
         while (it != counters.threads.end()) {
-            std::cerr << *it;
+            std::cerr << "W" << it->workload_id << ".T" << it->tid;
             ++it;
             if (it != counters.threads.end())
                 std::cerr << ", ";
@@ -467,17 +467,27 @@ schedule_stats_t::print_counters(const counters_t &counters)
               "% cpu busy by time, ignoring idle past last instr\n");
     std::cerr << "  Instructions per context switch histogram:\n";
     counters.instrs_per_switch->print();
+    if (!counters.cores_per_thread->empty()) {
+        std::cerr << "  Cores per thread:\n";
+        counters.cores_per_thread->print();
+    }
 }

 void
 schedule_stats_t::aggregate_results(counters_t &total)
 {
+    std::unordered_map<workload_tid_t, std::unordered_set<int64_t>, workload_tid_hash_t>
+        cpu_footprint;

     for (const auto &shard : shard_map_) {
         // First update our per-shard data with per-shard stats from the scheduler.
         get_scheduler_stats(shard.second->stream, shard.second->counters);
         total += shard.second->counters;
+        for (const workload_tid_t wtid : shard.second->counters.threads) {
+            cpu_footprint[wtid].insert(shard.second->core);
+        }
+
         // Sanity check against the scheduler's own stats, unless the trace
         // is pre-scheduled, or we're in core-serial mode where we don't have access
         // to the separate output streams, or we're in a unit test with a mock
         // stream.
@@ -508,6 +518,12 @@ schedule_stats_t::aggregate_results(counters_t &total)
     // will be 0; for mock streams in tests it will be < 0; otherwise, the scheduler
     // may see more migrations due to inputs not yet executed moving among runqueues.
     assert(total.migrations <= 0. || total.migrations >= total.observed_migrations);
+
+    // The += above had no cores_per_thread data to add: we derive it here
+    // from counters_t.threads via cpu_footprint rather than incrementally.
+    for (const auto &entry : cpu_footprint) {
+        total.cores_per_thread->add(entry.second.size());
+    }
 }

 bool
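[Illustration, not part of the patch. The footprint derivation in aggregate_results can be traced end to end with this self-contained sketch; it substitutes simplified stand-ins (std::pair for workload_tid_t, std::map for histogram_t) and reuses the data from test_cpu_footprint:

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <set>
    #include <utility>
    #include <vector>

    int main()
    {
        // Per-shard (workload, tid) sets, as counters_t.threads would hold them;
        // the outer index stands in for the shard's core.
        std::vector<std::set<std::pair<int64_t, int64_t>>> shard_threads = {
            { { 1234, 42 }, { 1234, 142 }, { 5678, 42 } }, // core 0
            { { 5678, 42 } },                              // core 1
            { { 5678, 242 }, { 1234, 42 } },               // core 2
            { { 5678, 242 } },                             // core 3
            { { 5678, 242 } },                             // core 4
        };
        // Union each thread's cores across shards, as aggregate_results does.
        std::map<std::pair<int64_t, int64_t>, std::set<int64_t>> cpu_footprint;
        for (size_t core = 0; core < shard_threads.size(); ++core) {
            for (const auto &wtid : shard_threads[core])
                cpu_footprint[wtid].insert(static_cast<int64_t>(core));
        }
        // Bin the footprint sizes with bin size 1, mirroring histogram_t::add.
        std::map<uint64_t, uint64_t> bin2count;
        for (const auto &entry : cpu_footprint)
            ++bin2count[entry.second.size()];
        for (const auto &keyval : bin2count)
            std::cout << keyval.first << ".." << keyval.first + 1 << " "
                      << keyval.second << "\n";
        // Prints "1..2 1", "2..3 2", "3..4 1": the counts asserted in the test.
        return 0;
    }
]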
diff --git a/clients/drcachesim/tools/schedule_stats.h b/clients/drcachesim/tools/schedule_stats.h
index 6056ef6965f..cc082a84232 100644
--- a/clients/drcachesim/tools/schedule_stats.h
+++ b/clients/drcachesim/tools/schedule_stats.h
@@ -89,14 +89,21 @@ class schedule_stats_t : public analysis_tool_t {
         add(int64_t value) = 0;
         virtual void
         merge(const histogram_interface_t *rhs) = 0;
+        virtual std::string
+        to_string() const = 0;
         virtual void
         print() const = 0;
+        virtual bool
+        empty() const = 0;
     };

     // Simple binning histogram for instrs-per-switch distribution.
     class histogram_t : public histogram_interface_t {
     public:
-        histogram_t() = default;
+        explicit histogram_t(uint64_t bin_size)
+            : bin_size_(bin_size)
+        {
+        }

         void
         add(int64_t value) override
@@ -104,7 +111,7 @@ class schedule_stats_t : public analysis_tool_t {
            // XXX: Add dynamic bin size changing.
            // For now with relatively known data ranges we just stick
            // with unchanging bin sizes.
-            uint64_t bin = value - (value % kInitialBinSize);
+            uint64_t bin = value - (value % bin_size_);
             ++bin2count_[bin];
         }

@@ -117,27 +124,78 @@ class schedule_stats_t : public analysis_tool_t {
             }
         }

-        void
-        print() const override
+        std::string
+        to_string() const override
         {
+            std::ostringstream stream;
             for (const auto &keyval : bin2count_) {
-                std::cerr << std::setw(12) << keyval.first << ".." << std::setw(8)
-                          << keyval.first + kInitialBinSize << " " << std::setw(5)
-                          << keyval.second << "\n";
+                stream << std::setw(12) << keyval.first << ".." << std::setw(8)
+                       << keyval.first + bin_size_ << " " << std::setw(5) << keyval.second
+                       << "\n";
             }
+            return stream.str();
+        }
+
+        void
+        print() const override
+        {
+            std::cerr << to_string();
+        }
+
+        bool
+        empty() const override
+        {
+            return bin2count_.empty();
         }

     protected:
-        static constexpr uint64_t kInitialBinSize = 50000;
+        uint64_t bin_size_;
         // Key is the inclusive lower bound of the bin.
         std::map<uint64_t, uint64_t> bin2count_;
     };

+    struct workload_tid_t {
+        workload_tid_t(int64_t workload, int64_t thread)
+            : workload_id(workload)
+            , tid(thread)
+        {
+        }
+        bool
+        operator==(const workload_tid_t &rhs) const
+        {
+            return workload_id == rhs.workload_id && tid == rhs.tid;
+        }
+
+        bool
+        operator!=(const workload_tid_t &rhs) const
+        {
+            return !(*this == rhs);
+        }
+        int64_t workload_id;
+        int64_t tid;
+    };
+
+    struct workload_tid_hash_t {
+        std::size_t
+        operator()(const workload_tid_t &wt) const
+        {
+            // Ensure {workload_id=X, tid=Y} doesn't always hash the same as
+            // {workload_id=Y, tid=X} by avoiding a simple symmetric wid^tid.
+            return std::hash<uint64_t>()(static_cast<uint64_t>(wt.workload_id ^ wt.tid)) ^
+                std::hash<int64_t>()(wt.tid);
+        }
+    };
+
     struct counters_t {
         counters_t()
         {
-            instrs_per_switch = std::unique_ptr<histogram_interface_t>(new histogram_t);
+            static constexpr uint64_t kSwitchBinSize = 50000;
+            static constexpr uint64_t kCoresBinSize = 1;
+            instrs_per_switch =
+                std::unique_ptr<histogram_interface_t>(new histogram_t(kSwitchBinSize));
+            cores_per_thread =
+                std::unique_ptr<histogram_interface_t>(new histogram_t(kCoresBinSize));
         }
         counters_t &
         operator+=(const counters_t &rhs)
         {
@@ -165,10 +223,13 @@ class schedule_stats_t : public analysis_tool_t {
             idle_micros_at_last_instr += rhs.idle_micros_at_last_instr;
             cpu_microseconds += rhs.cpu_microseconds;
             wait_microseconds += rhs.wait_microseconds;
-            for (const memref_tid_t tid : rhs.threads) {
+            for (const workload_tid_t tid : rhs.threads) {
                 threads.insert(tid);
             }
             instrs_per_switch->merge(rhs.instrs_per_switch.get());
+            // We do not track this incrementally, but we include the
+            // aggregation code for it for completeness.
+            cores_per_thread->merge(rhs.cores_per_thread.get());
             return *this;
         }
         // Statistics provided by scheduler.
@@ -203,40 +264,13 @@ class schedule_stats_t : public analysis_tool_t {
         uint64_t idle_micros_at_last_instr = 0;
         uint64_t cpu_microseconds = 0;
         uint64_t wait_microseconds = 0;
-        std::unordered_set<memref_tid_t> threads;
+        std::unordered_set<workload_tid_t, workload_tid_hash_t> threads;
         std::unique_ptr<histogram_interface_t> instrs_per_switch;
-    };
-
-    struct workload_tid_t {
-        workload_tid_t(int64_t workload, int64_t thread)
-            : workload_id(workload)
-            , tid(thread)
-        {
-        }
-        bool
-        operator==(const workload_tid_t &rhs) const
-        {
-            return workload_id == rhs.workload_id && tid == rhs.tid;
-        }
-
-        bool
-        operator!=(const workload_tid_t &rhs) const
-        {
-            return !(*this == rhs);
-        }
-        int64_t workload_id;
-        int64_t tid;
-    };
-
-    struct workload_tid_hash_t {
-        std::size_t
-        operator()(const workload_tid_t &wt) const
-        {
-            // Ensure {workload_id=X, tid=Y} doesn't always hash the same as
-            // {workload_id=Y, tid=X} by avoiding a simple symmetric wid^tid.
-            return std::hash<uint64_t>()(static_cast<uint64_t>(wt.workload_id ^ wt.tid)) ^
-                std::hash<int64_t>()(wt.tid);
-        }
+        // CPU footprint (the number of cores it ran on) of each thread. This is
+        // computable during aggregation from the .threads field above, so we do
+        // not track it incrementally. We still store it inside counters_t as this
+        // structure is assumed in several places to hold all aggregated statistics.
+        std::unique_ptr<histogram_interface_t> cores_per_thread;
     };

     counters_t
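[Illustration, not part of the patch. Why workload_tid_hash_t mixes in a second term: with a bare wid^tid the hash would be symmetric, so {workload_id=X, tid=Y} and {workload_id=Y, tid=X} would always collide. XORing in a hash of tid alone breaks that symmetry; for typical standard-library hash implementations the two values printed below differ. A standalone sketch with a simplified copy of the combining step:

    #include <cstdint>
    #include <functional>
    #include <iostream>

    int main()
    {
        // Simplified copy of the combining step in workload_tid_hash_t.
        auto hash_wtid = [](int64_t workload_id, int64_t tid) {
            return std::hash<uint64_t>()(static_cast<uint64_t>(workload_id ^ tid)) ^
                std::hash<int64_t>()(tid);
        };
        std::cout << "W1234.T42: " << hash_wtid(1234, 42) << "\n"
                  << "W42.T1234: " << hash_wtid(42, 1234) << "\n";
        return 0;
    }
]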