diff --git a/clients/drcachesim/common/memtrace_stream.h b/clients/drcachesim/common/memtrace_stream.h
index c8e59945624..b479ff5785c 100644
--- a/clients/drcachesim/common/memtrace_stream.h
+++ b/clients/drcachesim/common/memtrace_stream.h
@@ -394,6 +394,11 @@ class default_memtrace_stream_t : public memtrace_stream_t {
     {
         return shard_;
     }
+    int64_t
+    get_workload_id() const override
+    {
+        return workload_id_;
+    }
     // Also sets the shard index to the dynamic-discovery-order tid ordinal.
     void
     set_tid(int64_t tid)
@@ -413,12 +418,18 @@ class default_memtrace_stream_t : public memtrace_stream_t {
     {
         return tid_;
     }
+    void
+    set_workload_id(int64_t id)
+    {
+        workload_id_ = id;
+    }

 private:
     uint64_t *record_ordinal_ = nullptr;
     int64_t cpuid_ = 0;
     int shard_ = 0;
     int64_t tid_ = 0;
+    int64_t workload_id_ = 0;
     // To let a test set just the tid and get a shard index for free.
     std::unordered_map<int64_t, int> tid2shard_;
 };
diff --git a/clients/drcachesim/tests/core_serial.templatex b/clients/drcachesim/tests/core_serial.templatex
index 62314d5eecb..1fdc9b85311 100644
--- a/clients/drcachesim/tests/core_serial.templatex
+++ b/clients/drcachesim/tests/core_serial.templatex
@@ -1,7 +1,7 @@
 Schedule stats tool results:
 Total counts:
            4 cores
-           8 threads: 1257.*
+           8 threads: W0.T1257.*
      638938 instructions
           6 total context switches
   0.0093906 CSPKI \(context switches per 1000 instructions\)
@@ -35,6 +35,8 @@ Total counts:
   Instructions per context switch histogram:
            0..   50000     2
        50000..  100000     4
+  Cores per thread:
+           1..       2     8
 Core #0 counts:
 .*
 Core #1 counts:
diff --git a/clients/drcachesim/tests/memref_gen.h b/clients/drcachesim/tests/memref_gen.h
index 785b49e2b45..1515bd915e1 100644
--- a/clients/drcachesim/tests/memref_gen.h
+++ b/clients/drcachesim/tests/memref_gen.h
@@ -85,11 +85,12 @@ gen_data(memref_tid_t tid, bool load, addr_t addr, size_t size)

 inline memref_t
 gen_instr_type(trace_type_t type, memref_tid_t tid, addr_t pc = 1, size_t size = 1,
-               addr_t indirect_branch_target = 0)
+               addr_t indirect_branch_target = 0, memref_pid_t pid = 0)
 {
     memref_t memref = {};
     memref.instr.type = type;
     memref.instr.tid = tid;
+    memref.instr.pid = pid;
     memref.instr.addr = pc;
     memref.instr.size = size;
     memref.instr.indirect_branch_target = indirect_branch_target;
@@ -97,9 +98,10 @@ gen_instr_type(trace_type_t type, memref_tid_t tid, addr_t pc = 1, size_t size =
 }

 inline memref_t
-gen_instr(memref_tid_t tid, addr_t pc = 1, size_t size = 1)
+gen_instr(memref_tid_t tid, addr_t pc = 1, size_t size = 1, memref_pid_t pid = 0)
 {
-    return gen_instr_type(TRACE_TYPE_INSTR, tid, pc, size);
+    return gen_instr_type(TRACE_TYPE_INSTR, tid, pc, size, /*indirect_branch_target=*/0,
+                          pid);
 }

 inline memref_t
diff --git a/clients/drcachesim/tests/only_shards.templatex b/clients/drcachesim/tests/only_shards.templatex
index 44770a1b358..dd88dc4460c 100644
--- a/clients/drcachesim/tests/only_shards.templatex
+++ b/clients/drcachesim/tests/only_shards.templatex
@@ -1,7 +1,7 @@
 Schedule stats tool results:
 Total counts:
            2 cores
-           4 threads: 1257600, 1257602, 1257599, 1257603
+           4 threads: W0.T1257600, W0.T1257602, W0.T1257599, W0.T1257603
 .*
 Core #0 schedule: FJ_
 Core #1 schedule: GI
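[Illustration, not part of the patch. The new trailing pid parameter lets a test tag generated records with a workload identity, while the defaults keep every existing call site compiling unchanged. A minimal usage sketch in the style of the test changes below, with made-up tid/pc/pid values:

    // Same tid in two different workloads: only the pid distinguishes them.
    memref_t wx = gen_instr(/*tid=*/42, /*pc=*/0x1000, /*size=*/4, /*pid=*/1234);
    memref_t wy = gen_instr(/*tid=*/42, /*pc=*/0x1000, /*size=*/4, /*pid=*/5678);
    assert(wx.instr.tid == wy.instr.tid && wx.instr.pid != wy.instr.pid);
]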
diff --git a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex
index 40246ff30a5..b04b4d291de 100644
--- a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex
+++ b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex
@@ -1,7 +1,7 @@
 Schedule stats tool results:
 Total counts:
            4 cores
-           8 threads: 1257.*
+           8 threads: W0.T1257.*
      638938 instructions
           6 total context switches
   0\.0093906 CSPKI \(context switches per 1000 instructions\)
@@ -35,8 +35,10 @@ Total counts:
   Instructions per context switch histogram:
            0..   50000     2
        50000..  100000     4
+  Cores per thread:
+           1..       2     8
 Core #0 counts:
-           . threads: 1257.*
+           . threads: W0.T1257.*
  *[0-9]* instructions
            . total context switches
   0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
@@ -47,7 +49,7 @@ Core #0 counts:
         0\.00% direct switches
 .*
 Core #1 counts:
-           . threads: 1257.*
+           . threads: W0.T1257.*
  *[0-9]* instructions
            . total context switches
   0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
@@ -58,7 +60,7 @@ Core #1 counts:
         0\.00% direct switches
 .*
 Core #2 counts:
-           . threads: 1257.*
+           . threads: W0.T1257.*
  *[0-9]* instructions
            . total context switches
   0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
@@ -69,7 +71,7 @@ Core #2 counts:
         0\.00% direct switches
 .*
 Core #3 counts:
-           . threads: 1257.*
+           . threads: W0.T1257.*
  *[0-9]* instructions
            . total context switches
   0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
diff --git a/clients/drcachesim/tests/schedule_stats_test.cpp b/clients/drcachesim/tests/schedule_stats_test.cpp
index 75f7db93d9e..caf5820e2c3 100644
--- a/clients/drcachesim/tests/schedule_stats_test.cpp
+++ b/clients/drcachesim/tests/schedule_stats_test.cpp
@@ -122,6 +122,7 @@ run_schedule_stats(const std::vector<std::vector<memref_t>> &memrefs)
                 continue;
             memref_t memref = memrefs[cpu][per_core[cpu].memref_idx];
             per_core[cpu].stream.set_tid(memref.instr.tid);
+            per_core[cpu].stream.set_workload_id(memref.instr.pid);
             per_core[cpu].stream.set_output_cpuid(cpu);
             bool res = tool.parallel_shard_memref(per_core[cpu].shard_data, memref);
             assert(res);
@@ -280,12 +281,55 @@
     return true;
 }

+static bool
+test_cpu_footprint()
+{
+    static constexpr int64_t PID_X = 1234;
+    static constexpr int64_t PID_Y = 5678;
+    static constexpr int64_t TID_A = 42;
+    static constexpr int64_t TID_B = 142;
+    static constexpr int64_t TID_C = 242;
+    static constexpr addr_t INSTR_PC = 1001;
+    static constexpr size_t INSTR_SIZE = 4;
+    std::vector<std::vector<memref_t>> memrefs = {
+        {
+            gen_instr(TID_A, INSTR_PC, INSTR_SIZE, PID_X),
+            gen_instr(TID_B, INSTR_PC, INSTR_SIZE, PID_X),
+            // Test identical tids in different workloads.
+            gen_instr(TID_A, INSTR_PC, INSTR_SIZE, PID_Y),
+        },
+        {
+            gen_instr(TID_A, INSTR_PC, INSTR_SIZE, PID_Y),
+        },
+        {
+            gen_instr(TID_C, INSTR_PC, INSTR_SIZE, PID_Y),
+            gen_instr(TID_A, INSTR_PC, INSTR_SIZE, PID_X),
+        },
+        {
+            gen_instr(TID_C, INSTR_PC, INSTR_SIZE, PID_Y),
+        },
+        {
+            gen_instr(TID_C, INSTR_PC, INSTR_SIZE, PID_Y),
+        },
+    };
+    auto result = run_schedule_stats(memrefs);
+    assert(result.instrs == 8);
+    std::string hist = result.cores_per_thread->to_string();
+    std::cerr << "Cores-per-thread histogram:\n" << hist << "\n";
+    // We expect X.A=2, X.B=1, Y.A=2, Y.C=3:
+    assert(hist ==
+           "           1..       2     1\n"
+           "           2..       3     2\n"
+           "           3..       4     1\n");
+    return true;
+}
+
 } // namespace

 int
 test_main(int argc, const char *argv[])
 {
-    if (test_basic_stats() && test_idle()) {
+    if (test_basic_stats() && test_idle() && test_cpu_footprint()) {
         std::cerr << "schedule_stats_test passed\n";
         return 0;
     }
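[Aside on the histogram asserted in test_cpu_footprint, derived from the test data above: core 0 runs W1234.T42, W1234.T142, and W5678.T42; core 1 runs W5678.T42; core 2 runs W5678.T242 and W1234.T42; cores 3 and 4 each run only W5678.T242. The per-thread core footprints are therefore X.A=2, X.B=1, Y.A=2, Y.C=3, and with a bin size of 1 that yields one thread in bin [1,2), two in [2,3), and one in [3,4), matching the three asserted lines.]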
diff --git a/clients/drcachesim/tools/schedule_stats.cpp b/clients/drcachesim/tools/schedule_stats.cpp
index 979d3df588e..5d4b26b6b81 100644
--- a/clients/drcachesim/tools/schedule_stats.cpp
+++ b/clients/drcachesim/tools/schedule_stats.cpp
@@ -354,7 +354,7 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref
         shard->saw_exit = false;
     }
     if (memref.instr.tid != INVALID_THREAD_ID)
-        shard->counters.threads.insert(memref.instr.tid);
+        shard->counters.threads.insert(workload_tid_t(workload_id, memref.instr.tid));
     if (memref.marker.type == TRACE_TYPE_MARKER) {
         if (memref.marker.marker_type == TRACE_MARKER_TYPE_SYSCALL) {
             ++shard->counters.syscalls;
@@ -397,7 +397,7 @@ schedule_stats_t::print_counters(const counters_t &counters)
         std::cerr << ": ";
         auto it = counters.threads.begin();
         while (it != counters.threads.end()) {
-            std::cerr << *it;
+            std::cerr << "W" << it->workload_id << ".T" << it->tid;
             ++it;
             if (it != counters.threads.end())
                 std::cerr << ", ";
@@ -467,17 +467,27 @@ schedule_stats_t::print_counters(const counters_t &counters)
               "% cpu busy by time, ignoring idle past last instr\n");
     std::cerr << "  Instructions per context switch histogram:\n";
     counters.instrs_per_switch->print();
+    if (!counters.cores_per_thread->empty()) {
+        std::cerr << "  Cores per thread:\n";
+        counters.cores_per_thread->print();
+    }
 }

 void
 schedule_stats_t::aggregate_results(counters_t &total)
 {
+    std::unordered_map<workload_tid_t, std::unordered_set<int64_t>, workload_tid_hash_t>
+        cpu_footprint;

     for (const auto &shard : shard_map_) {
         // First update our per-shard data with per-shard stats from the scheduler.
         get_scheduler_stats(shard.second->stream, shard.second->counters);
         total += shard.second->counters;
+        for (const workload_tid_t wtid : shard.second->counters.threads) {
+            cpu_footprint[wtid].insert(shard.second->core);
+        }
+
         // Sanity check against the scheduler's own stats, unless the trace
         // is pre-scheduled, or we're in core-serial mode where we don't have access
         // to the separate output streams, or we're in a unit test with a mock
         // stream.
@@ -508,6 +518,12 @@ schedule_stats_t::aggregate_results(counters_t &total)
     // will be 0; for mock streams in tests it will be < 0; otherwise, the scheduler
     // may see more migrations due to inputs not yet executed moving among runqueues.
     assert(total.migrations <= 0. || total.migrations >= total.observed_migrations);
+
+    // The += above had no cores_per_thread data to add: we derive it here
+    // from counters_t.threads via cpu_footprint rather than incrementally.
+    for (const auto &entry : cpu_footprint) {
+        total.cores_per_thread->add(entry.second.size());
+    }
 }

 bool
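[Illustration, not part of the patch. The footprint derivation in aggregate_results can be traced end to end with this self-contained sketch; it substitutes simplified stand-ins (std::pair for workload_tid_t, std::map for histogram_t) and reuses the data from test_cpu_footprint:

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <set>
    #include <utility>
    #include <vector>

    int main()
    {
        // Per-shard (workload, tid) sets, as counters_t.threads would hold them;
        // the outer index stands in for the shard's core.
        std::vector<std::set<std::pair<int64_t, int64_t>>> shard_threads = {
            { { 1234, 42 }, { 1234, 142 }, { 5678, 42 } }, // core 0
            { { 5678, 42 } },                              // core 1
            { { 5678, 242 }, { 1234, 42 } },               // core 2
            { { 5678, 242 } },                             // core 3
            { { 5678, 242 } },                             // core 4
        };
        // Union each thread's cores across shards, as aggregate_results does.
        std::map<std::pair<int64_t, int64_t>, std::set<int64_t>> cpu_footprint;
        for (size_t core = 0; core < shard_threads.size(); ++core) {
            for (const auto &wtid : shard_threads[core])
                cpu_footprint[wtid].insert(static_cast<int64_t>(core));
        }
        // Bin the footprint sizes with bin size 1, mirroring histogram_t::add.
        std::map<uint64_t, uint64_t> bin2count;
        for (const auto &entry : cpu_footprint)
            ++bin2count[entry.second.size()];
        for (const auto &keyval : bin2count)
            std::cout << keyval.first << ".." << keyval.first + 1 << " "
                      << keyval.second << "\n";
        // Prints "1..2 1", "2..3 2", "3..4 1": the counts asserted in the test.
        return 0;
    }
]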
diff --git a/clients/drcachesim/tools/schedule_stats.h b/clients/drcachesim/tools/schedule_stats.h
index 6056ef6965f..cc082a84232 100644
--- a/clients/drcachesim/tools/schedule_stats.h
+++ b/clients/drcachesim/tools/schedule_stats.h
@@ -89,14 +89,21 @@ class schedule_stats_t : public analysis_tool_t {
         add(int64_t value) = 0;
         virtual void
         merge(const histogram_interface_t *rhs) = 0;
+        virtual std::string
+        to_string() const = 0;
         virtual void
         print() const = 0;
+        virtual bool
+        empty() const = 0;
     };

     // Simple binning histogram for instrs-per-switch distribution.
     class histogram_t : public histogram_interface_t {
     public:
-        histogram_t() = default;
+        explicit histogram_t(uint64_t bin_size)
+            : bin_size_(bin_size)
+        {
+        }

         void
         add(int64_t value) override
@@ -104,7 +111,7 @@ class schedule_stats_t : public analysis_tool_t {
            // XXX: Add dynamic bin size changing.
            // For now with relatively known data ranges we just stick
            // with unchanging bin sizes.
-            uint64_t bin = value - (value % kInitialBinSize);
+            uint64_t bin = value - (value % bin_size_);
             ++bin2count_[bin];
         }

@@ -117,27 +124,78 @@ class schedule_stats_t : public analysis_tool_t {
             }
         }

-        void
-        print() const override
+        std::string
+        to_string() const override
         {
+            std::ostringstream stream;
             for (const auto &keyval : bin2count_) {
-                std::cerr << std::setw(12) << keyval.first << ".." << std::setw(8)
-                          << keyval.first + kInitialBinSize << " " << std::setw(5)
-                          << keyval.second << "\n";
+                stream << std::setw(12) << keyval.first << ".." << std::setw(8)
+                       << keyval.first + bin_size_ << " " << std::setw(5) << keyval.second
+                       << "\n";
             }
+            return stream.str();
+        }
+
+        void
+        print() const override
+        {
+            std::cerr << to_string();
+        }
+
+        bool
+        empty() const override
+        {
+            return bin2count_.empty();
         }

     protected:
-        static constexpr uint64_t kInitialBinSize = 50000;
+        uint64_t bin_size_;
         // Key is the inclusive lower bound of the bin.
         std::map<uint64_t, uint64_t> bin2count_;
     };

+    struct workload_tid_t {
+        workload_tid_t(int64_t workload, int64_t thread)
+            : workload_id(workload)
+            , tid(thread)
+        {
+        }
+        bool
+        operator==(const workload_tid_t &rhs) const
+        {
+            return workload_id == rhs.workload_id && tid == rhs.tid;
+        }
+
+        bool
+        operator!=(const workload_tid_t &rhs) const
+        {
+            return !(*this == rhs);
+        }
+        int64_t workload_id;
+        int64_t tid;
+    };
+
+    struct workload_tid_hash_t {
+        std::size_t
+        operator()(const workload_tid_t &wt) const
+        {
+            // Ensure {workload_id=X, tid=Y} doesn't always hash the same as
+            // {workload_id=Y, tid=X} by avoiding a simple symmetric wid^tid.
+            return std::hash<uint64_t>()(static_cast<uint64_t>(wt.workload_id ^ wt.tid)) ^
+                std::hash<int64_t>()(wt.tid);
+        }
+    };
+
     struct counters_t {
         counters_t()
         {
-            instrs_per_switch = std::unique_ptr<histogram_interface_t>(new histogram_t);
+            static constexpr uint64_t kSwitchBinSize = 50000;
+            static constexpr uint64_t kCoresBinSize = 1;
+            instrs_per_switch =
+                std::unique_ptr<histogram_interface_t>(new histogram_t(kSwitchBinSize));
+            cores_per_thread =
+                std::unique_ptr<histogram_interface_t>(new histogram_t(kCoresBinSize));
         }
         counters_t &
         operator+=(const counters_t &rhs)
         {
@@ -165,10 +223,13 @@ class schedule_stats_t : public analysis_tool_t {
             idle_micros_at_last_instr += rhs.idle_micros_at_last_instr;
             cpu_microseconds += rhs.cpu_microseconds;
             wait_microseconds += rhs.wait_microseconds;
-            for (const memref_tid_t tid : rhs.threads) {
+            for (const workload_tid_t tid : rhs.threads) {
                 threads.insert(tid);
             }
             instrs_per_switch->merge(rhs.instrs_per_switch.get());
+            // We do not track this incrementally, but we include the
+            // aggregation code for it for completeness.
+            cores_per_thread->merge(rhs.cores_per_thread.get());
             return *this;
         }
         // Statistics provided by scheduler.
@@ -203,40 +264,13 @@ class schedule_stats_t : public analysis_tool_t {
         uint64_t idle_micros_at_last_instr = 0;
         uint64_t cpu_microseconds = 0;
         uint64_t wait_microseconds = 0;
-        std::unordered_set<memref_tid_t> threads;
+        std::unordered_set<workload_tid_t, workload_tid_hash_t> threads;
         std::unique_ptr<histogram_interface_t> instrs_per_switch;
-    };
-
-    struct workload_tid_t {
-        workload_tid_t(int64_t workload, int64_t thread)
-            : workload_id(workload)
-            , tid(thread)
-        {
-        }
-        bool
-        operator==(const workload_tid_t &rhs) const
-        {
-            return workload_id == rhs.workload_id && tid == rhs.tid;
-        }
-
-        bool
-        operator!=(const workload_tid_t &rhs) const
-        {
-            return !(*this == rhs);
-        }
-        int64_t workload_id;
-        int64_t tid;
-    };
-
-    struct workload_tid_hash_t {
-        std::size_t
-        operator()(const workload_tid_t &wt) const
-        {
-            // Ensure {workload_id=X, tid=Y} doesn't always hash the same as
-            // {workload_id=Y, tid=X} by avoiding a simple symmetric wid^tid.
-            return std::hash<uint64_t>()(static_cast<uint64_t>(wt.workload_id ^ wt.tid)) ^
-                std::hash<int64_t>()(wt.tid);
-        }
+        // CPU footprint (the number of cores it ran on) of each thread. This is
+        // computable during aggregation from the .threads field above, so we do
+        // not track it incrementally. We still store it inside counters_t as this
+        // structure is assumed in several places to hold all aggregated statistics.
+        std::unique_ptr<histogram_interface_t> cores_per_thread;
     };

     counters_t
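[Illustration, not part of the patch. Why workload_tid_hash_t mixes in a second term: with a bare wid^tid the hash would be symmetric, so {workload_id=X, tid=Y} and {workload_id=Y, tid=X} would always collide. XORing in a hash of tid alone breaks that symmetry; for typical standard-library hash implementations the two values printed below differ. A standalone sketch with a simplified copy of the combining step:

    #include <cstdint>
    #include <functional>
    #include <iostream>

    int main()
    {
        // Simplified copy of the combining step in workload_tid_hash_t.
        auto hash_wtid = [](int64_t workload_id, int64_t tid) {
            return std::hash<uint64_t>()(static_cast<uint64_t>(workload_id ^ tid)) ^
                std::hash<int64_t>()(tid);
        };
        std::cout << "W1234.T42: " << hash_wtid(1234, 42) << "\n"
                  << "W42.T1234: " << hash_wtid(42, 1234) << "\n";
        return 0;
    }
]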