Skip to content

Commit

Permalink
i#7067 sched cpu limits: Add thread cpu footprint to schedule_stats (#7095)

Browse files Browse the repository at this point in the history

Augments schedule_stats to record how many cores each thread appears on.
This is used to compute an overall histogram of cores per thread, which
is a useful metric for studying migrations over time. This is done in a
multi-workload-aware manner to avoid problems with duplicate thread ids
across workloads.

This adds new pure-virtual functions to the
schedule_stats_t::histogram_t interface, so subclasses will need to be
updated.
It also changes schedule_stats_t::counters_t.threads to use
workload_tid_t rather than just tid.

Adds some multi-workload support to the testing helpers in
default_memtrace_stream_t and memref_gen.

Adds a unit test and updates existing tests.

Issue: #7067
  • Loading branch information
derekbruening authored Nov 26, 2024
1 parent 4a2301b commit a7e58ee
Show file tree
Hide file tree
Showing 8 changed files with 167 additions and 56 deletions.
11 changes: 11 additions & 0 deletions clients/drcachesim/common/memtrace_stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,11 @@ class default_memtrace_stream_t : public memtrace_stream_t {
{
return shard_;
}
// Accessor for the workload identifier previously stored via set_workload_id().
int64_t get_workload_id() const override { return workload_id_; }
// Also sets the shard index to the dynamic-discovery-order tid ordinal.
void
set_tid(int64_t tid)
Expand All @@ -413,12 +418,18 @@ class default_memtrace_stream_t : public memtrace_stream_t {
{
return tid_;
}
// Records the workload identifier that get_workload_id() will report.
void set_workload_id(int64_t id) { workload_id_ = id; }

private:
// Externally owned record counter; may be null (default). NOTE(review):
// presumably read by get_record_ordinal() — that method is outside this view.
uint64_t *record_ordinal_ = nullptr;
int64_t cpuid_ = 0;
int shard_ = 0;
int64_t tid_ = 0;
// Workload id for multi-workload tests; set via set_workload_id().
int64_t workload_id_ = 0;
// To let a test set just the tid and get a shard index for free.
std::unordered_map<int64_t, int> tid2shard_;
};
Expand Down
4 changes: 3 additions & 1 deletion clients/drcachesim/tests/core_serial.templatex
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Schedule stats tool results:
Total counts:
4 cores
8 threads: 1257.*
8 threads: W0.T1257.*
638938 instructions
6 total context switches
0.0093906 CSPKI \(context switches per 1000 instructions\)
Expand Down Expand Up @@ -35,6 +35,8 @@ Total counts:
Instructions per context switch histogram:
0.. 50000 2
50000.. 100000 4
Cores per thread:
1.. 2 8
Core #0 counts:
.*
Core #1 counts:
Expand Down
8 changes: 5 additions & 3 deletions clients/drcachesim/tests/memref_gen.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,21 +85,23 @@ gen_data(memref_tid_t tid, bool load, addr_t addr, size_t size)

// Builds a memref_t instruction record of the given trace type.
// The trailing "pid" parameter defaults to 0 for backward compatibility;
// multi-workload tests pass a nonzero pid to distinguish workloads
// (e.g., schedule_stats_test feeds it to set_workload_id()).
// (Fix: the scraped diff retained both the old and new parameter lists,
// which does not compile; this is the merged post-change signature.)
inline memref_t
gen_instr_type(trace_type_t type, memref_tid_t tid, addr_t pc = 1, size_t size = 1,
               addr_t indirect_branch_target = 0, memref_pid_t pid = 0)
{
    memref_t memref = {};
    memref.instr.type = type;
    memref.instr.tid = tid;
    memref.instr.pid = pid;
    memref.instr.addr = pc;
    memref.instr.size = size;
    memref.instr.indirect_branch_target = indirect_branch_target;
    return memref;
}

// Convenience wrapper for gen_instr_type() that generates a plain
// TRACE_TYPE_INSTR record. The "pid" parameter defaults to 0 so existing
// single-workload callers are unaffected.
// (Fix: the scraped diff retained both the old and new signature/return
// lines, which does not compile; this is the merged post-change version.)
inline memref_t
gen_instr(memref_tid_t tid, addr_t pc = 1, size_t size = 1, memref_pid_t pid = 0)
{
    return gen_instr_type(TRACE_TYPE_INSTR, tid, pc, size, /*indirect_branch_target=*/0,
                          pid);
}

inline memref_t
Expand Down
2 changes: 1 addition & 1 deletion clients/drcachesim/tests/only_shards.templatex
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Schedule stats tool results:
Total counts:
2 cores
4 threads: 1257600, 1257602, 1257599, 1257603
4 threads: W0.T1257600, W0.T1257602, W0.T1257599, W0.T1257603
.*
Core #0 schedule: FJ_
Core #1 schedule: GI
12 changes: 7 additions & 5 deletions clients/drcachesim/tests/schedule_stats_nopreempt.templatex
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Schedule stats tool results:
Total counts:
4 cores
8 threads: 1257.*
8 threads: W0.T1257.*
638938 instructions
6 total context switches
0\.0093906 CSPKI \(context switches per 1000 instructions\)
Expand Down Expand Up @@ -35,8 +35,10 @@ Total counts:
Instructions per context switch histogram:
0.. 50000 2
50000.. 100000 4
Cores per thread:
1.. 2 8
Core #0 counts:
. threads: 1257.*
. threads: W0.T1257.*
*[0-9]* instructions
. total context switches
0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
Expand All @@ -47,7 +49,7 @@ Core #0 counts:
0\.00% direct switches
.*
Core #1 counts:
. threads: 1257.*
. threads: W0.T1257.*
*[0-9]* instructions
. total context switches
0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
Expand All @@ -58,7 +60,7 @@ Core #1 counts:
0\.00% direct switches
.*
Core #2 counts:
. threads: 1257.*
. threads: W0.T1257.*
*[0-9]* instructions
. total context switches
0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
Expand All @@ -69,7 +71,7 @@ Core #2 counts:
0\.00% direct switches
.*
Core #3 counts:
. threads: 1257.*
. threads: W0.T1257.*
*[0-9]* instructions
. total context switches
0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
Expand Down
46 changes: 45 additions & 1 deletion clients/drcachesim/tests/schedule_stats_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ run_schedule_stats(const std::vector<std::vector<memref_t>> &memrefs)
continue;
memref_t memref = memrefs[cpu][per_core[cpu].memref_idx];
per_core[cpu].stream.set_tid(memref.instr.tid);
per_core[cpu].stream.set_workload_id(memref.instr.pid);
per_core[cpu].stream.set_output_cpuid(cpu);
bool res = tool.parallel_shard_memref(per_core[cpu].shard_data, memref);
assert(res);
Expand Down Expand Up @@ -280,12 +281,55 @@ test_idle()
return true;
}

// Verifies the cores-per-thread footprint histogram, including that threads
// with identical tids in different workloads are counted separately.
static bool
test_cpu_footprint()
{
// Workload ids ride in the pid field: run_schedule_stats() calls
// set_workload_id(memref.instr.pid) for each record.
static constexpr int64_t PID_X = 1234;
static constexpr int64_t PID_Y = 5678;
static constexpr int64_t TID_A = 42;
static constexpr int64_t TID_B = 142;
static constexpr int64_t TID_C = 242;
static constexpr addr_t INSTR_PC = 1001;
static constexpr size_t INSTR_SIZE = 4;
// Outer index is the core. Resulting per-thread core footprints:
//   X.A on cores {0,2}; X.B on {0}; Y.A on {0,1}; Y.C on {2,3,4}.
std::vector<std::vector<memref_t>> memrefs = {
{
gen_instr(TID_A, INSTR_PC, INSTR_SIZE, PID_X),
gen_instr(TID_B, INSTR_PC, INSTR_SIZE, PID_X),
// Test identical tids in different workloads.
gen_instr(TID_A, INSTR_PC, INSTR_SIZE, PID_Y),
},
{
gen_instr(TID_A, INSTR_PC, INSTR_SIZE, PID_Y),
},
{
gen_instr(TID_C, INSTR_PC, INSTR_SIZE, PID_Y),
gen_instr(TID_A, INSTR_PC, INSTR_SIZE, PID_X),
},
{
gen_instr(TID_C, INSTR_PC, INSTR_SIZE, PID_Y),
},
{
gen_instr(TID_C, INSTR_PC, INSTR_SIZE, PID_Y),
},
};
auto result = run_schedule_stats(memrefs);
// 8 instruction records in total across the 5 cores above.
assert(result.instrs == 8);
std::string hist = result.cores_per_thread->to_string();
std::cerr << "Cores-per-thread histogram:\n" << hist << "\n";
// We expect X.A=2, X.B=1, Y.A=2, Y.C=3:
// i.e. one thread on 1 core, two threads on 2 cores, one thread on 3 cores.
assert(hist ==
" 1.. 2 1\n"
" 2.. 3 2\n"
" 3.. 4 1\n");
return true;
}

} // namespace

int
test_main(int argc, const char *argv[])
{
if (test_basic_stats() && test_idle()) {
if (test_basic_stats() && test_idle() && test_cpu_footprint()) {
std::cerr << "schedule_stats_test passed\n";
return 0;
}
Expand Down
20 changes: 18 additions & 2 deletions clients/drcachesim/tools/schedule_stats.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref
shard->saw_exit = false;
}
if (memref.instr.tid != INVALID_THREAD_ID)
shard->counters.threads.insert(memref.instr.tid);
shard->counters.threads.insert(workload_tid_t(workload_id, memref.instr.tid));
if (memref.marker.type == TRACE_TYPE_MARKER) {
if (memref.marker.marker_type == TRACE_MARKER_TYPE_SYSCALL) {
++shard->counters.syscalls;
Expand Down Expand Up @@ -397,7 +397,7 @@ schedule_stats_t::print_counters(const counters_t &counters)
std::cerr << ": ";
auto it = counters.threads.begin();
while (it != counters.threads.end()) {
std::cerr << *it;
std::cerr << "W" << it->workload_id << ".T" << it->tid;
++it;
if (it != counters.threads.end())
std::cerr << ", ";
Expand Down Expand Up @@ -467,17 +467,27 @@ schedule_stats_t::print_counters(const counters_t &counters)
"% cpu busy by time, ignoring idle past last instr\n");
std::cerr << " Instructions per context switch histogram:\n";
counters.instrs_per_switch->print();
if (!counters.cores_per_thread->empty()) {
std::cerr << " Cores per thread:\n";
counters.cores_per_thread->print();
}
}

void
schedule_stats_t::aggregate_results(counters_t &total)
{
std::unordered_map<workload_tid_t, std::unordered_set<int64_t>, workload_tid_hash_t>
cpu_footprint;
for (const auto &shard : shard_map_) {
// First update our per-shard data with per-shard stats from the scheduler.
get_scheduler_stats(shard.second->stream, shard.second->counters);

total += shard.second->counters;

for (const workload_tid_t wtid : shard.second->counters.threads) {
cpu_footprint[wtid].insert(shard.second->core);
}

// Sanity check against the scheduler's own stats, unless the trace
// is pre-scheduled, or we're in core-serial mode where we don't have access
// to the separate output streams, or we're in a unit test with a mock
Expand Down Expand Up @@ -508,6 +518,12 @@ schedule_stats_t::aggregate_results(counters_t &total)
// will be 0; for mock streams in tests it will be < 0; otherwise, the scheduler
// may see more migrations due to inputs not yet executed moving among runqueues.
assert(total.migrations <= 0. || total.migrations >= total.observed_migrations);

// The += had no data to add as we do not update cores_per_thread incrementally
// and instead derive it from counters_t.threads via cpu_footprint here.
for (const auto &entry : cpu_footprint) {
total.cores_per_thread->add(entry.second.size());
}
}

bool
Expand Down
Loading

0 comments on commit a7e58ee

Please sign in to comment.