diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 4e14d3c..186e8f2 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -10,6 +10,8 @@ env:
   # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
   BUILD_TYPE: Release
 
+
+
 jobs:
   build:
     # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac.
@@ -21,20 +23,40 @@ jobs:
     - uses: actions/checkout@v3
 
     - name: Install dependencies
-      run: sudo apt install llvm-dev clang libbpf-dev libclang-dev libcxxopts-dev libfmt-dev librange-v3-dev
+      run: sudo apt install llvm-dev clang libbpf-dev libclang-dev python3-pip gcc-13 g++-13 ninja-build && git submodule update --init --recursive
 
-    - name: Install Custom dependencies
-      run: wget http://launchpadlibrarian.net/605552811/libbpf0_0.8.0-1_amd64.deb && wget http://launchpadlibrarian.net/605552807/libbpf-dev_0.8.0-1_amd64.deb && sudo dpkg -i ./libbpf0_0.8.0-1_amd64.deb && sudo dpkg -i ./libbpf-dev_0.8.0-1_amd64.deb 
-
-    - name: Sed Current uncompiled include file
-      run: sudo sed -i 's/NL_SET_ERR_MSG_MOD/\/\/NL_SET_ERR_MSG_MOD/g' /usr/src/linux-headers-`uname -r`/include/net/flow_offload.h
+    - name: Install conan
+      working-directory: ${{github.workspace}}
+      run: pip3 install conan && conan profile detect && mkdir build && cd build && cp ../conanfile.txt . && CC=gcc-13 CXX=g++-13 conan install . -s compiler.cppstd=gnu23
 
     - name: Configure CMake
       # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
       # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
-      run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
+      run: cmake -B ${{github.workspace}}/build -GNinja -DCMAKE_MAKE_PROGRAM=ninja -DCMAKE_C_COMPILER=gcc-13 -DCMAKE_CXX_COMPILER=g++-13 -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DCMAKE_TOOLCHAIN_FILE=${{github.workspace}}/build/build/${{env.BUILD_TYPE}}/generators/conan_toolchain.cmake -DCMAKE_POLICY_DEFAULT_CMP0091=NEW
 
     - name: Build
       # Build your program with the given configuration
       run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
 
+    - name: Create Release  # Creates a published (non-draft, non-prerelease) GitHub release.
+      id: create_release  # Referenced by the upload step as steps.create_release.outputs.upload_url.
+      uses: actions/create-release@v1  # NOTE(review): the upstream action appears to be archived/unmaintained - confirm and consider a maintained alternative.
+      env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token
+      with:
+        tag_name: main  # NOTE(review): fixed tag; a later run will likely fail because a release for this tag already exists - verify.
+        release_name: Release main
+        body: |
+          Changes in this Release
+          - First Change
+          - Second Change
+        draft: false
+        prerelease: false
+
+    - name: Upload Assets to Release with a wildcard  # NOTE(review): the pattern below is a literal path, not a wildcard.
+      uses: csexton/release-asset-action@v2
+      with:
+        pattern: "build/CXLMemSim"  # Expected location of the CXLMemSim executable after the Build step, which builds into <workspace>/build.
+        github-token: ${{ secrets.GITHUB_TOKEN }}
+        release-url: ${{ steps.create_release.outputs.upload_url }}  # Upload URL emitted by the create_release step.
+
diff --git a/.gitignore b/.gitignore
index 561b9f0..7ad5b53 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,11 @@ benchmarks*
 Makefile
 *.o
 py_smdk_pkg
-lib
\ No newline at end of file
+lib
+CMakePresets.json
+.cmake
+CMakeUserPresets.json
+*~
+voltdb
+foo
+CMakeFiles
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..6544518
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,15 @@
+[submodule "workloads/memcached-ycsb"]
+	path = workloads/memcached-ycsb
+	url = https://github.com/SlugLab/YCSB/
+[submodule "workloads/memcached"]
+	path = workloads/memcached
+	url = https://github.com/memcached/memcached
+[submodule "workloads/gapbs"]
+	path = workloads/gapbs
+	url = https://github.com/victoryang00/gapbs
+[submodule "script/perfmon"]
+	path = script/perfmon
+	url = https://github.com/intel/perfmon
+[submodule "workloads/vectordb"]
+	path = workloads/vectordb
+	url = https://github.com/jina-ai/vectordb
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4b6fc86..7dd1584 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,29 +1,27 @@
 cmake_minimum_required(VERSION 3.11.0)
-project(CXL-MEM-Simulator VERSION 0.1.0)
+project(CXLMemSim VERSION 0.1.0)
+set(CMAKE_CXX_STANDARD 23) # NOTE(review): the value 23 is only recognized by CMake >= 3.20, but cmake_minimum_required above asks for 3.11 - confirm.
+
+if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") # Force colored diagnostics with the flag each compiler understands.
+    add_compile_options (-fdiagnostics-color=always)
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+    add_compile_options (-fcolor-diagnostics)
+endif ()
+
+list(APPEND CMAKE_PREFIX_PATH ${CMAKE_BINARY_DIR}) # Let the find_package() calls below also search the build directory.
 
 find_package(cxxopts REQUIRED)
 find_package(fmt REQUIRED)
-find_package(range-v3 REQUIRED)
 file(GLOB_RECURSE SOURCE_FILES src/*.cpp)
 
 execute_process(COMMAND uname -r OUTPUT_VARIABLE arch OUTPUT_STRIP_TRAILING_WHITESPACE)
-set(LINUX_SOURCE /lib/modules/${arch}/build/)
-set(CMAKE_CXX_FLAGS "-Wall -g -pthread -lrt -rdynamic")
-set(CMAKE_CXX_STANDARD 23)
-
-add_executable(CXL-MEM-Simulator ${SOURCE_FILES})
-
-include_directories(CXL-MEM-Simulator include)
-target_link_libraries(CXL-MEM-Simulator cxxopts::cxxopts fmt::fmt range-v3::range-v3 elf bpf)
+set(CMAKE_CXX_FLAGS "-Wall -fPIC -pthread -ldl -lrt -mavx512f -mpreferred-stack-boundary=4 -g -O0")
 
-function(bpf prefix)
-    add_custom_target(${prefix}_bpf ALL
-            COMMAND clang --target=bpf -nostdinc -S -I/usr/include/linux -I${CMAKE_SOURCE_DIR}/include -I${LINUX_SOURCE}/arch/x86/include -I/usr/include -I${LINUX_SOURCE}/arch/x86/include/uapi -I${LINUX_SOURCE}/arch/x86/include/generated -I${LINUX_SOURCE}/arch/x86/include/generated/uapi -I${LINUX_SOURCE}/include -I${LINUX_SOURCE}/include/uapi -I${LINUX_SOURCE}/include/generated/uapi -I${LINUX_SOURCE}/tools/testing/selftests/bpf -include ${LINUX_SOURCE}/include/linux/kconfig.h -D__KERNEL__ -D__ASM_SYSREG_H -D__BPF_TRACING__ -D__TARGET_ARCH_x86 -Wno-implicit-function-declaration -O3 -emit-llvm -g -c ${CMAKE_SOURCE_DIR}/src/${prefix}.c -o ${CMAKE_BINARY_DIR}/${prefix}.ll
-            COMMAND llc -march=bpf -filetype=obj -o ${CMAKE_BINARY_DIR}/${prefix}.o ${CMAKE_BINARY_DIR}/${prefix}.ll
-            )
-    add_dependencies(CXL-MEM-Simulator ${prefix}_bpf)
-endfunction()
+add_executable(CXLMemSim ${SOURCE_FILES} src/main.cc)
 
-bpf(collectmmap)
+include_directories(CXLMemSim include ${cxxopts_INCLUDE_DIR} ${fmt_INCLUDE_DIR}) # NOTE(review): every argument is treated as a directory, so "CXLMemSim" becomes an include path rather than naming the target - confirm intent.
+target_link_libraries(CXLMemSim fmt::fmt cxxopts::cxxopts) # Link the simulator against the fmt and cxxopts imported targets.
 
-add_subdirectory(microbench)
\ No newline at end of file
+add_library(CXLMemSimHook SHARED src/module.cc) # Shared library built from src/module.cc only.
+add_executable(CXLMemSimSock ${SOURCE_FILES} src/sock.cc) # Second executable: the globbed src/*.cpp sources plus src/sock.cc.
+target_link_libraries(CXLMemSimSock fmt::fmt cxxopts::cxxopts) # Same fmt/cxxopts link line as the CXLMemSim target.
diff --git a/README.md b/README.md
index 2d2c453..4f54b24 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,20 @@
 # CXL.mem Simulator
-The epoch design of this project is mostly refering to [mes](https://github.com/takahiro-hirofuchi/mesmeric-emulator), the novelty is use pebs to construct the topology and calculate the hierachy latency based on this. See the [talk](https://docs.google.com/file/d/1bZi2rbB-u5xMw_YET726gb2s9QuxMZJE/edit?usp=docslist_api&filetype=mspresentation)
+The CXL.mem simulator uses a target latency to simulate, from the CPU's perspective, the penalty seen at the application level, taking the ROB and the different cacheline states into account.
 
 ## Prerequisite
 ```bash
 $ uname -a
-Linux gpu01 5.19.0-29-generic #30-Ubuntu SMP PREEMPT_DYNAMIC Wed Jan 4 12:14:09 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
-$ sudo apt install llvm-dev clang libbpf-dev libclang-dev libcxxopts-dev libfmt-dev librange-v3-dev
+Linux banana 6.4.0+ #86 SMP PREEMPT_DYNAMIC Fri Jul 28 23:49:33 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
+$ echo 0 | sudo tee /sys/devices/system/node/node1/cpu*/online >/dev/null 2>&1
 ```
 ## User input
 ```bash
-LOGV=1 ./CXL-MEM-Simulator -t ./microbench/many_calloc -i 5 -c 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+LOGV=1 ./CXLMemSim -t ./microbench/ld -i 5 -c 0,2 -d 85 -b 10,10 -l 100,100 -c 100,100 -w 85.5,86.5,87.5,85.5,86.5,87.5,88. -o "(1,(2,3))"
 ```
 1. -t Target: The path to the executable
 2. -i Interval: The epoch of the simulator, the parameter is in milisecond
 3. -c CPUSet: The core id to run the executable and the rest will be `setaffinity` to one other core
-4. -d Dram Latency: The current platform's DRAM latency, default is 85ns
+4. -d Dram Latency: The current platform's DRAM latency, default is 85ns # mark that bw in the remote
 5. -b, -l Bandwidth, Latency: Both use 2 input in the vector, first for read, second for write
 6. -c Capacity: The capacity of the memory with first be local, remaining accordingly to the input vector.
 7. -w Weight: Use the heuristic to calculate the bandwidth
@@ -30,16 +30,3 @@ LOGV=1 ./CXL-MEM-Simulator -t ./microbench/many_calloc -i 5 -c 0,1,2,3,4,5,6,7,8
                   3
 ```
 9. env LOGV stands for logs level that you can see.
-## Limitation
-The pebs requires no larger than 5 `perf_open_event` attached to certain PID, so I limit the bpf program to munmap(kprobe) and sbrk(kprobe/kretprobe), you can configure them. For multiple process application, I need to first SIGSTOP the process and `send/recv` back the PID information. For client and server application, I need to SIGSTOP/SIGCONT on both client and server simultaneously, which is not implemented yet.
-
-## Cite
-```bash
-@article{yangyarch23,
-  title={CXLMemSim: A pure software simulated CXL.mem for performance characterization},
-  author={Yiwei Yang, Pooneh Safayenikoo, Jiacheng Ma, Tanvir Ahmed Khan, Andrew Quinn},
-  journal={arXiv preprint arXiv:2303.06153},
-  booktitle={The fifth Young Architect Workshop (YArch'23)},
-  year={2023}
-}
-```
\ No newline at end of file
diff --git a/artifact/build_and_run_all.sh b/artifact/build_and_run_all.sh
deleted file mode 100755
index 29eeac1..0000000
--- a/artifact/build_and_run_all.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-sudo apt install llvm-dev clang libbpf-dev libclang-dev libcxxopts-dev libfmt-dev librange-v3-dev ninja-build
-
-mkdir build
-cd build
-cmake -GNinja ..
-ninja
-
-sudo bash -c "LOGV=0 ./CXL-MEM-Simulator -t './microbench/many_calloc'" > many_calloc_cxlmemsim.txt
-time ./microbench/many_calloc > many_calloc_time.txt
-sudo bash -c "LOGV=0 ./CXL-MEM-Simulator -t './microbench/many_mmap_write'" > many_mmap_write_cxlmemsim.txt
-time ./microbench/many_mmap_write > many_mmap_write_time.txt
-sudo bash -c "LOGV=0 ./CXL-MEM-Simulator -t './microbench/many_mmap_read'" > many_mmap_read_cxlmemsim.txt
-time ./microbench/many_mmap_read > many_mmap_read_time.txt
-sudo bash -c "LOGV=0 ./CXL-MEM-Simulator -t './microbench/many_malloc'" > many_malloc_cxlmemsim.txt
-time ./microbench/many_malloc > many_malloc_time.txt
-sudo bash -c "LOGV=0 ./CXL-MEM-Simulator -t './microbench/many_sbrk'" > many_sbrk_cxlmemsim.txt
-time ./microbench/many_sbrk > many_sbrk_time.txt
\ No newline at end of file
diff --git a/artifact/compare_with_gem5.sh b/artifact/compare_with_gem5.sh
deleted file mode 100644
index aa17ac5..0000000
--- a/artifact/compare_with_gem5.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-cd build
-git clone https://github.com/fadedzipper/gem5-cxl -b cxl.mem-dev
-cd gem5-cxl
-scons build/ARM/gem5.opt -j 16
-time build/X86/gem5.opt configs/example/se.py -c ../microbench/many_calloc > many_calloc_gem5.txt
-time build/X86/gem5.opt configs/example/se.py -c ../microbench/many_mmap_write > many_mmap_write_gem5.txt
-time build/X86/gem5.opt configs/example/se.py -c ../microbench/many_mmap_read > many_mmap_read_gem5.txt
-time build/X86/gem5.opt configs/example/se.py -c ../microbench/many_malloc > many_malloc_gem5.txt
-time build/X86/gem5.opt configs/example/se.py -c ../microbench/many_sbrk > many_sbrk_gem5.txt
\ No newline at end of file
diff --git a/artifact/gen_workloads.sh b/artifact/gen_workloads.sh
new file mode 100644
index 0000000..8330531
--- /dev/null
+++ b/artifact/gen_workloads.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# anns
+# monetdb
+# pointer_chasing lmbench3
+# wrf stream
+# mlc
+# gromacs
+# smdk's
+# ^ Workload names noted by the author; only the gapbs build and one PDB download are scripted below.
+git clone https://github.com/scott-beamer/gapbs.git  # NOTE(review): .gitmodules tracks workloads/gapbs from a different fork (victoryang00/gapbs) - confirm which is intended.
+cd gapbs  # NOTE(review): unchecked; if the clone failed, the commands below run in the caller's directory.
+make benchmark
+
+wget https://files.rcsb.org/download/4i4f.pdb  # Saved into the gapbs checkout because of the cd above; presumably input for the gromacs workload - confirm.
\ No newline at end of file
diff --git a/artifact/mlc.txt b/artifact/mlc-alderlake.txt
similarity index 100%
rename from artifact/mlc.txt
rename to artifact/mlc-alderlake.txt
diff --git a/artifact/mlc-sapphirerapids.txt b/artifact/mlc-sapphirerapids.txt
new file mode 100644
index 0000000..f3d8da5
--- /dev/null
+++ b/artifact/mlc-sapphirerapids.txt
@@ -0,0 +1,55 @@
+Intel(R) Memory Latency Checker - v3.10
+*** Unable to modify prefetchers (try executing 'modprobe msr')
+*** So, enabling random access for latency measurements
+Measuring idle latencies for random access (in ns)...
+                Numa node   Numa node
+Numa node             0           1
+       0          106.3       437.5
+
+Measuring Peak Injection Memory Bandwidths for the system
+Bandwidths are in MB/sec (1 MB/sec = 1,000,000 Bytes/sec)
+Using all the threads from each core if Hyper-threading is enabled
+Using traffic with the following read-write ratios
+ALL Reads        :      28611.9
+3:1 Reads-Writes :      25057.1
+2:1 Reads-Writes :      25078.0
+1:1 Reads-Writes :      23965.9
+Stream-triad like:      24943.3
+
+Measuring Memory Bandwidths between nodes within system 
+Bandwidths are in MB/sec (1 MB/sec = 1,000,000 Bytes/sec)
+Using all the threads from each core if Hyper-threading is enabled
+Using Read-only traffic type
+                Numa node   Numa node
+Numa node             0            1
+       0        28612.0      19216.8
+
+Measuring Loaded Latencies for the system
+Using all the threads from each core if Hyper-threading is enabled
+Using Read-only traffic type
+Inject  Latency Bandwidth
+Delay   (ns)    MB/sec
+==========================
+00000   370.12   28393.2
+00002   369.29   28435.4
+00008   378.41   28490.5
+00015   354.32   28414.2
+00050   313.07   28323.0
+00100   238.51   28010.5
+00200   125.13   14566.0
+00300   119.69   10232.0
+00400   116.76   7905.3
+00500   115.33   6500.4
+00700   113.89   4858.3
+01000   113.03   3594.6
+01300   112.57   2906.9
+01700   112.09   2363.9
+02500   111.51   1798.9
+03500   111.21   1520.8
+05000   110.77   1193.2
+09000   110.38   922.3
+20000   110.14   735.6
+
+Measuring cache-to-cache transfer latency (in ns)...
+Local Socket L2->L2 HIT  latency        67.3
+Local Socket L2->L2 HITM latency        67.5
\ No newline at end of file
diff --git a/conanfile.txt b/conanfile.txt
new file mode 100644
index 0000000..1b4945e
--- /dev/null
+++ b/conanfile.txt
@@ -0,0 +1,9 @@
+[requires]
+cxxopts/3.0.0
+fmt/9.0.0
+nlohmann_json/3.11.2
+[generators]
+CMakeDeps
+CMakeToolchain
+[layout]
+cmake_layout
\ No newline at end of file
diff --git a/include/alloc.h b/include/alloc.h
deleted file mode 100644
index e9a4cfb..0000000
--- a/include/alloc.h
+++ /dev/null
@@ -1,12 +0,0 @@
-//
-// Created by victoryang00 on 2/2/23.
-//
-
-#ifndef CXL_MEM_SIMULATOR_ALLOC_H
-#define CXL_MEM_SIMULATOR_ALLOC_H
-
-class Allocator {
-
-};
-
-#endif // CXL_MEM_SIMULATOR_ALLOC_H
diff --git a/include/cxlcontroller.h b/include/cxlcontroller.h
index f4d1246..351d2d2 100644
--- a/include/cxlcontroller.h
+++ b/include/cxlcontroller.h
@@ -2,43 +2,64 @@
 // Created by victoryang00 on 1/14/23.
 //
 
-#ifndef CXL_MEM_SIMULATOR_CXLCONTROLLER_H
-#define CXL_MEM_SIMULATOR_CXLCONTROLLER_H
+#ifndef CXLMEMSIM_CXLCONTROLLER_H
+#define CXLMEMSIM_CXLCONTROLLER_H
 
 #include "cxlcounter.h"
 #include "cxlendpoint.h"
 #include <cstdint>
 #include <string_view>
+#include <unordered_map>
 #include <vector>
 
+enum page_type { CACHELINE, PAGE, HUGEPAGE_2M, HUGEPAGE_1G };
+
 class CXLController;
-class Policy {
+class AllocationPolicy { // Abstract allocation-policy interface; CXLController stores a pointer to one.
 public:
-    Policy();
+    AllocationPolicy(); // Declared only; the definition is not in this header.
     virtual int compute_once(CXLController *) = 0;
+    // No write problem. NOTE(review): no virtual destructor is declared; deleting a subclass through this base would be undefined - confirm ownership.
+};
+class MigrationPolicy { // Abstract migration-policy interface; subclasses must implement compute_once().
+public:
+    MigrationPolicy(); // Declared only; the definition is not in this header.
+    virtual int compute_once(CXLController *) = 0; // reader writer; the meaning of the int result is not specified here
+    // paging related
+    // switching related. NOTE(review): no virtual destructor is declared - confirm objects are never deleted through this base.
+};
+
+// Abstract paging-policy interface. Author note: needs a timeout; the added latency will be handled later.
+class PagingPolicy {
+public:
+    PagingPolicy(); // Declared only; the definition is not in this header.
+    virtual int compute_once(CXLController *) = 0; // reader writer
+    // paging related. NOTE(review): no virtual destructor is declared - confirm objects are never deleted through this base.
 };
+
 class CXLController : CXLSwitch {
 public:
     std::vector<CXLMemExpander *> cur_expanders{};
     int capacity; // GB
-    Policy *policy;
+    AllocationPolicy *policy;
     CXLCounter counter;
     std::map<uint64_t, uint64_t> occupation;
     std::map<uint64_t, uint64_t> va_pa_map;
-    bool is_page;
+    enum page_type page_type_; // percentage
     int num_switches = 0;
-    CXLController(Policy *policy, int capacity, bool is_page, int epoch);
+
+    CXLController(AllocationPolicy *p, int capacity, enum page_type page_type_, int epoch);
     void construct_topo(std::string_view newick_tree);
     void insert_end_point(CXLMemExpander *end_point);
     std::vector<std::string> tokenize(const std::string_view &s);
-    std::tuple<double,std::vector<uint64_t>> calculate_congestion() override;
+    std::tuple<double, std::vector<uint64_t>> calculate_congestion() override;
     void set_epoch(int epoch) override;
     std::tuple<int, int> get_all_access() override;
-    double calculate_latency(LatencyPass elem); // traverse the tree to calculate the latency
-    double calculate_bandwidth(BandwidthPass elem);
+    double calculate_latency(LatencyPass elem) override; // traverse the tree to calculate the latency
+    double calculate_bandwidth(BandwidthPass elem) override;
     int insert(uint64_t timestamp, uint64_t phys_addr, uint64_t virt_addr, int index) override;
     void delete_entry(uint64_t addr, uint64_t length) override;
     std::string output() override;
 };
 
-#endif // CXL_MEM_SIMULATOR_CXLCONTROLLER_H
+#endif // CXLMEMSIM_CXLCONTROLLER_H
diff --git a/include/cxlcounter.h b/include/cxlcounter.h
index d4b0d09..116ecd3 100644
--- a/include/cxlcounter.h
+++ b/include/cxlcounter.h
@@ -2,16 +2,17 @@
 // Created by victoryang00 on 1/12/23.
 //
 
-#ifndef CXL_MEM_SIMULATOR_CXLCOUNTER_H
-#define CXL_MEM_SIMULATOR_CXLCOUNTER_H
+#ifndef CXLMEMSIM_CXLCOUNTER_H
+#define CXLMEMSIM_CXLCOUNTER_H
 
 #include <cstdint>
-#include <vector>
-#include <string>
 #include <map>
+#include <string>
 #include <tuple>
+#include <unordered_map>
+#include <vector>
 
-/** TODO: Whether to using the pebs to record the state. add back invalidation */
+/** TODO: Decide whether to use PEBS to record the state. Add back invalidation, huge/regular page migration, and prefetch. */
 class CXLSwitchEvent {
 public:
     uint64_t load = 0;
@@ -51,4 +52,4 @@ class CXLCounter {
     void inc_hitm();
 };
 
-#endif // CXL_MEM_SIMULATOR_CXLCOUNTER_H
+#endif // CXLMEMSIM_CXLCOUNTER_H
diff --git a/include/cxlendpoint.h b/include/cxlendpoint.h
index ed94861..39c1f63 100644
--- a/include/cxlendpoint.h
+++ b/include/cxlendpoint.h
@@ -2,11 +2,55 @@
 // Created by victoryang00 on 1/13/23.
 //
 
-#ifndef CXL_MEM_SIMULATOR_CXLENDPOINT_H
-#define CXL_MEM_SIMULATOR_CXLENDPOINT_H
+#ifndef CXLMEMSIM_CXLENDPOINT_H
+#define CXLMEMSIM_CXLENDPOINT_H
 #include "cxlcounter.h"
 #include "helper.h"
 
+class LRUCache { // Bounded key/value store that evicts the least recently used key when a new key arrives at capacity.
+    std::list<uint64_t> lru_list; // keys ordered from most recently used (front) to least recently used (back)
+    std::unordered_map<uint64_t, std::list<uint64_t>::iterator> lru_map; // key -> its node in lru_list
+    std::unordered_map<uint64_t, uint64_t> wb_map; // key -> stored value
+    size_t capacity; // maximum number of keys kept
+
+public:
+    LRUCache(size_t cap) : capacity(cap) {} // NOTE(review): not explicit, so a size_t converts implicitly to LRUCache - confirm intended
+
+    void insert(uint64_t key, uint64_t value) { // Store or overwrite the value for key and mark key as most recently used.
+        // Check if the item is already in the cache
+        if (lru_map.find(key) != lru_map.end()) {
+            // Move the element to the front of the list
+            lru_list.erase(lru_map[key]);
+            lru_list.push_front(key);
+            lru_map[key] = lru_list.begin();
+            wb_map[key] = value;
+        } else {
+            // If the cache is full, remove the least recently used item
+            if (lru_list.size() == capacity) { // NOTE(review): with capacity == 0 the list is empty here, so back()/pop_back() are undefined
+                uint64_t old_key = lru_list.back();
+                lru_list.pop_back();
+                lru_map.erase(old_key);
+                wb_map.erase(old_key);
+            }
+            // Insert the new item
+            lru_list.push_front(key);
+            lru_map[key] = lru_list.begin();
+            wb_map[key] = value;
+        }
+    }
+
+    uint64_t get(uint64_t key) { // Return the value stored for key and mark it most recently used; throws std::runtime_error if absent.
+        if (lru_map.find(key) == lru_map.end()) {
+            throw std::runtime_error("Key not found");
+        }
+        // Move the accessed item to the front of the list
+        lru_list.erase(lru_map[key]);
+        lru_list.push_front(key);
+        lru_map[key] = lru_list.begin();
+        return wb_map[key];
+    }
+};
+
 class CXLEndPoint {
     virtual void set_epoch(int epoch) = 0;
     virtual std::string output() = 0;
@@ -27,6 +71,9 @@ class CXLMemExpander : public CXLEndPoint {
     std::map<uint64_t, uint64_t> va_pa_map; // va, pa
     CXLMemExpanderEvent counter{};
     CXLMemExpanderEvent last_counter{};
+
+    LRUCache lru_cache;
+    // tlb map and paging map -> invalidate
     int last_read = 0;
     int last_write = 0;
     double last_latency = 0.;
@@ -50,6 +97,9 @@ class CXLSwitch : public CXLEndPoint {
     int id = -1;
     int epoch = 0;
     uint64_t last_timestamp = 0;
+    // get the approximate congestion and target done time
+    std::unordered_map<uint64_t, uint64_t> timeseries_map;
+
     double congestion_latency = 0.02;
     explicit CXLSwitch(int id);
     std::tuple<int, int> get_all_access() override;
@@ -62,4 +112,4 @@ class CXLSwitch : public CXLEndPoint {
     void set_epoch(int epoch) override;
 };
 
-#endif // CXL_MEM_SIMULATOR_CXLENDPOINT_H
+#endif // CXLMEMSIM_CXLENDPOINT_H
diff --git a/include/helper.h b/include/helper.h
index 6330ef0..07d4810 100644
--- a/include/helper.h
+++ b/include/helper.h
@@ -2,12 +2,13 @@
 // Created by victoryang00 on 1/12/23.
 //
 
-#ifndef CXL_MEM_SIMULATOR_HELPER_H
-#define CXL_MEM_SIMULATOR_HELPER_H
+#ifndef CXLMEMSIM_HELPER_H
+#define CXLMEMSIM_HELPER_H
 
 #include "incore.h"
 #include "logging.h"
 #include "uncore.h"
+#include <csignal>
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
@@ -16,7 +17,7 @@
 #include <fnmatch.h>
 #include <linux/perf_event.h>
 #include <map>
-#include <signal.h>
+#include <optional>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <unistd.h>
@@ -28,6 +29,16 @@ class Incore;
 class Uncore;
 class Helper;
 
+struct PerfConfig { // Value-initialized bundle of perf settings: one path format string plus four CHA and four CPU entries.
+    std::string path_format_cha_type{}; // presumably a format string for locating the CHA PMU type file - TODO confirm
+    std::array<std::tuple<std::string, uint64_t, uint64_t>, 4> cha{}; // element 0 is printed as the counter name by fmt::formatter<Monitors>; the two integers are presumably perf config values - TODO confirm
+    std::array<std::tuple<std::string, uint64_t, uint64_t>, 4> cpu{}; // same tuple layout as cha, for the CPU counters
+};
+struct ModelContext { // Bundles a model number with a PerfConfig.
+    uint32_t model{}; // presumably the CPU model id - TODO confirm
+    struct PerfConfig perf_conf; // perf settings carried alongside model
+};
+
 struct EmuCXLLatency {
     double read;
     double write;
@@ -47,22 +58,16 @@ struct BandwidthPass {
 struct LatencyPass {
     std::tuple<int, int> all_access;
     double dramlatency;
-    double ma_ro;
-    double ma_wb;
+    uint64_t readonly;
+    uint64_t writeback;
 };
 
-struct CBOElem {
-    uint64_t llc_wb;
+struct CHAElem {
+    std::array<uint64_t, 4> cha;
 };
 
 struct CPUElem {
-    uint64_t all_dram_rds;
-    uint64_t cpu_l2stall_t;
-    uint64_t cpu_llcl_hits;
-    uint64_t cpu_llcl_miss;
-    uint64_t cpu_bandwidth_read;
-    uint64_t cpu_bandwidth_write;
-    std::map<uint64_t, uint64_t> cpu_munmap_address_length;
+    std::array<uint64_t, 4> cpu;
 };
 
 struct PEBSElem {
@@ -79,54 +84,39 @@ struct CPUInfo {
 
 struct Elem {
     struct CPUInfo cpuinfo;
-    struct CBOElem *cbos;
-    struct CPUElem *cpus;
+    std::vector<CHAElem> chas;
+    std::vector<CPUElem> cpus;
     struct PEBSElem pebs;
 };
 
 class PMUInfo {
 public:
-    std::vector<Uncore> cbos;
+    std::vector<Uncore> chas;
     std::vector<Incore> cpus;
     Helper *helper;
     PMUInfo(pid_t pid, Helper *h, struct PerfConfig *perf_config);
     ~PMUInfo();
     int start_all_pmcs();
     int stop_all_pmcs();
-    int freeze_counters_cbo_all();
-    int unfreeze_counters_cbo_all();
-};
-
-struct PerfConfig {
-    const char *path_format_cbo_type;
-    uint64_t cbo_config;
-    uint64_t all_dram_rds_config;
-    uint64_t all_dram_rds_config1;
-    uint64_t cpu_l2stall_config;
-    uint64_t cpu_llcl_hits_config;
-    uint64_t cpu_llcl_miss_config;
-    uint64_t cpu_bandwidth_read_config;
-    uint64_t cpu_bandwidth_write_config;
-};
-
-struct ModelContext {
-    uint32_t model;
-    struct PerfConfig perf_conf;
+    int freeze_counters_cha_all();
+    int unfreeze_counters_cha_all();
 };
 
 class Helper {
 public:
-    int cpu;
-    int cbo;
-    double cpu_freq;
-    PerfConfig perf_conf;
+    PerfConfig perf_conf{};
     Helper();
-    static int num_of_cpu();
-    static int num_of_cbo();
+    int cpu;
+    int cha;
+    std::vector<int> used_cpu;
+    std::vector<int> used_cha;
+    int num_of_cpu();
+    int num_of_cha();
     static void detach_children();
     static void noop_handler(int signum);
-    double cpu_frequency() const;
-    PerfConfig detect_model(uint32_t);
+    double cpu_frequency();
+    PerfConfig detect_model(uint32_t model, const std::vector<std::string> &perf_name,
+                            const std::vector<uint64_t> &perf_conf1, const std::vector<uint64_t> &perf_conf2);
 };
 
-#endif // CXL_MEM_SIMULATOR_HELPER_H
+#endif // CXLMEMSIM_HELPER_H
diff --git a/include/incore.h b/include/incore.h
index 6e34116..8338b22 100644
--- a/include/incore.h
+++ b/include/incore.h
@@ -1,38 +1,34 @@
 // Created by victoryang00 on 1/14/23.
 //
 
-#ifndef CXL_MEM_SIMULATOR_INCORE_H
-#define CXL_MEM_SIMULATOR_INCORE_H
+#ifndef CXLMEMSIM_INCORE_H
+#define CXLMEMSIM_INCORE_H
 #include "helper.h"
 #include "perf.h"
-#include <sys/types.h>
-class CXLController;
+#include <array>
+#include <cstdint>
+
+class CXLController; // TODO: need to be shm gotten
 union CPUID_INFO {
     int array[4];
     struct {
         unsigned int eax, ebx, ecx, edx;
     } reg;
 };
+/** This is a per cha metrics*/
 class Incore {
 public:
-    PerfInfo *perf[5];
+    std::array<PerfInfo *, 4> perf{nullptr}; // should only be 4 counters
     struct PerfConfig *perf_config;
-    Incore(const pid_t pid, const int cpu, struct PerfConfig *perf_config);
+    Incore(pid_t pid, int cpu, struct PerfConfig *perf_config);
     ~Incore() = default;
     int start();
     int stop();
-    void init_all_dram_rds(const pid_t pid, const int cpu);
-    void init_cpu_l2stall(const pid_t pid, const int cpu);
-    void init_cpu_llcl_hits(const pid_t pid, const int cpu);
-    void init_cpu_llcl_miss(const pid_t pid, const int cpu);
-    void init_cpu_mem_read(const pid_t pid, const int cpu);
-    void init_cpu_mem_write(const pid_t pid, const int cpu);
-    void init_cpu_ebpf(const pid_t pid, const int cpu);
 
-    int read_cpu_elems(struct CPUElem *cpu_elem);
+    ssize_t read_cpu_elems(struct CPUElem *cpu_elem);
 };
 
 void pcm_cpuid(unsigned leaf, CPUID_INFO *info);
 bool get_cpu_info(struct CPUInfo *);
 
-#endif // CXL_MEM_SIMULATOR_INCORE_H
+#endif // CXLMEMSIM_INCORE_H
diff --git a/include/logging.h b/include/logging.h
index 94fd004..14942a8 100644
--- a/include/logging.h
+++ b/include/logging.h
@@ -2,8 +2,8 @@
 // Created by victoryang00 on 1/13/23.
 //
 
-#ifndef CXL_MEM_SIMULATOR_LOGGING_H
-#define CXL_MEM_SIMULATOR_LOGGING_H
+#ifndef CXLMEMSIM_LOGGING_H
+#define CXLMEMSIM_LOGGING_H
 
 #include <algorithm>
 #include <cstdlib>
@@ -15,11 +15,22 @@
 #include <fstream>
 #include <iostream>
 #include <list>
+#include <ranges>
 #include <source_location>
 #include <sstream>
 #include <string>
+#include "sock.h"
 
-enum LogLevel { DEBUG = 0, INFO, WARNING, ERROR };
+/** Barry's work: pipeable range adaptor that zips a range with an int index counting up from 0. */
+struct Enumerate : std::ranges::range_adaptor_closure<Enumerate> {
+    template <std::ranges::viewable_range R> constexpr auto operator()(R &&r) const {
+        return std::views::zip(std::views::iota(0), (R &&) r); // (R &&) r is a C-style spelling of std::forward<R>(r)
+    }
+};
+// Usage: `range | enumerate` yields (index, element) tuples.
+inline constexpr Enumerate enumerate;
+
+enum LogLevel { DEBUG = 0, INFO, WARNING, ERROR, TRACE };
 
 class LogStream;
 class LogWriter;
@@ -30,15 +41,17 @@ class LogWriter {
         char *logv = std::getenv("LOGV");
         if (logv) {
             env_log_level = std::stoi(logv);
+            file_ = std::fstream(OUTPUT_PMU_PATH, std::ios::out | std::ios::app);
         } else {
             env_log_level = 4;
         }
     };
-
+    ~LogWriter() = default;
     void operator<(const LogStream &stream);
 
 private:
     void output_log(const std::ostringstream &g);
+    std::fstream file_;
     std::source_location location_;
     LogLevel log_level_;
     int env_log_level;
@@ -66,8 +79,9 @@ fmt::color level2color(LogLevel level);
 #define LOG_IF(level) LogWriter(std::source_location::current(), level) < LogStream()
 #define LOG(level) LOG_##level
 #define LOG_DEBUG LOG_IF(DEBUG)
+#define LOG_TRACE LOG_IF(TRACE)
 #define LOG_INFO LOG_IF(INFO)
 #define LOG_WARNING LOG_IF(WARNING)
 #define LOG_ERROR LOG_IF(ERROR)
 
-#endif // CXL_MEM_SIMULATOR_LOGGING_H
+#endif // CXLMEMSIM_LOGGING_H
diff --git a/include/monitor.h b/include/monitor.h
index 8763d2c..9655476 100644
--- a/include/monitor.h
+++ b/include/monitor.h
@@ -2,8 +2,8 @@
 // Created by victoryang00 on 1/11/23.
 //
 
-#ifndef SLUGALLOCATOR_MONITOR_H
-#define SLUGALLOCATOR_MONITOR_H
+#ifndef CXLMEMSIM_MONITOR_H
+#define CXLMEMSIM_MONITOR_H
 
 #include "cxlcontroller.h"
 #include "helper.h"
@@ -22,46 +22,112 @@ enum MONITOR_STATUS {
     MONITOR_TERMINATED = 2,
     MONITOR_NOPERMISSION = 3,
     MONITOR_DISABLE = 4,
+    MONITOR_SUSPEND = 5,
     MONITOR_UNKNOWN = 0xff
 };
 
+extern Helper helper;
+
 class Monitor;
 class Monitors {
 public:
     std::vector<Monitor> mon;
-    Monitors(int tnum, cpu_set_t *use_cpuset, int nmem, Helper h);
+    bool print_flag;
+    Monitors(int tnum, cpu_set_t *use_cpuset);
     ~Monitors() = default;
 
-    void stop_all(const int);
-    void run_all(const int);
-    int enable(const uint32_t, const uint32_t, bool, uint64_t, const int32_t, bool is_page);
-    void disable(const uint32_t target);
-    int terminate(const uint32_t, const uint32_t, const int32_t);
-    bool check_all_terminated(const uint32_t);
-    bool check_continue(const uint32_t, const struct timespec);
+    void stop_all(int);
+    void run_all(int);
+    Monitor get_mon(int, int);
+    int enable(const uint32_t, const uint32_t, bool, uint64_t, const int32_t);
+    void disable(uint32_t target);
+    int terminate(uint32_t, uint32_t, int32_t);
+    bool check_all_terminated(uint32_t);
+    bool check_continue(uint32_t, struct timespec);
 };
 
 class Monitor {
 public:
-    pid_t tgid;
+    pid_t tgid; // process id
     pid_t tid;
     uint32_t cpu_core;
     char status;
     struct timespec injected_delay; // recorded time for injected
     struct timespec wasted_delay; // recorded time for calling between continue and calculation
     struct timespec squabble_delay; // inj-was
-    struct Elem elem[2];
+    struct Elem elem[2]; // before & after
     struct Elem *before, *after;
     double total_delay;
     struct timespec start_exec_ts, end_exec_ts;
     bool is_process;
     struct PEBS *pebs_ctx;
 
-    Monitor(const int nmem, Helper h);
+    explicit Monitor();
 
     void stop();
     void run();
-    void clear_time(struct timespec *);
+    static void clear_time(struct timespec *);
+};
+
+template <> struct fmt::formatter<Monitors> { // Formats a Monitors object as one comma-separated line: column labels when print_flag is set, counter deltas otherwise.
+    fmt::formatter<int> f; // only used to parse the format spec
+
+    constexpr auto parse(auto &ctx) { return f.parse(ctx); }
+
+    auto format(Monitors const &p, auto &ctx) const {
+        auto out = fmt::format_to(ctx.out(), ""); // writes nothing; just obtains the output iterator
+        if (p.print_flag) { // header line: one "mon<id>_<counter name>_<used id>_<position>" label per counter
+            for (auto const &[mon_id, mon] : p.mon | enumerate) {
+                for (auto core_idx = 0; core_idx < helper.used_cha.size(); core_idx++) { // NOTE(review): auto deduces int, so this compares signed with unsigned
+                    for (auto cha_idx = 0; cha_idx < helper.perf_conf.cha.size(); cha_idx++) {
+                        out = fmt::format_to(out, "mon{}_{}_{}_{},", mon_id, std::get<0>(helper.perf_conf.cha[cha_idx]),
+                                             helper.used_cha[core_idx], core_idx);
+                    }
+                }
+
+                for (auto core_idx = 0; core_idx < helper.used_cpu.size(); core_idx++) {
+                    for (auto cpu_idx = 0; cpu_idx < helper.perf_conf.cpu.size(); cpu_idx++) {
+                        if (cpu_idx == helper.perf_conf.cpu.size() - 1 && core_idx == helper.used_cpu.size() - 1) { // last CPU column of this monitor: no trailing comma
+                            out = fmt::format_to(out, "mon{}_{}_{}_{}", mon_id, // NOTE(review): with more than one monitor the next monitor's first column follows with no separator
+                                                 std::get<0>(helper.perf_conf.cpu[cpu_idx]), helper.used_cpu[core_idx],
+                                                 core_idx);
+                        } else {
+                            out = fmt::format_to(out, "mon{}_{}_{}_{},", mon_id,
+                                                 std::get<0>(helper.perf_conf.cpu[cpu_idx]), helper.used_cpu[core_idx],
+                                                 core_idx);
+                        }
+                    }
+                }
+            }
+        } else {
+            // Data line: after - before for every counter, in the same column order as the header line.
+            for (auto const &[mon_id, mon] : p.mon | enumerate) {
+                for (auto core_idx = 0; core_idx < helper.used_cha.size(); core_idx++) {
+                    for (auto cha_idx = 0; cha_idx < helper.perf_conf.cha.size(); cha_idx++) {
+                        out = fmt::format_to(out, "{},",
+                                             mon.after->chas[core_idx].cha[cha_idx] -
+                                                 mon.before->chas[core_idx].cha[cha_idx]);
+                    }
+                }
+                for (auto core_idx = 0; core_idx < helper.used_cpu.size(); core_idx++) {
+                    for (auto cpu_idx = 0; cpu_idx < helper.perf_conf.cpu.size(); cpu_idx++) {
+                        if (cpu_idx == helper.perf_conf.cpu.size() - 1 && core_idx == helper.used_cpu.size() - 1) { // last CPU value of this monitor: no trailing comma
+                            out = fmt::format_to(out, "{}",
+                                                 mon.after->cpus[core_idx].cpu[cpu_idx] -
+                                                     mon.before->cpus[core_idx].cpu[cpu_idx]);
+                        } else {
+                            out = fmt::format_to(out, "{},",
+                                                 mon.after->cpus[core_idx].cpu[cpu_idx] -
+                                                     mon.before->cpus[core_idx].cpu[cpu_idx]);
+                        }
+                    }
+                }
+            } // visitor mode write to the file
+        }
+        //        *out++ = '\n';
+        ctx.advance_to(out);
+        return out;
+    };
+};
 
-#endif // SLUGALLOCATOR_MONITOR_H
+#endif // CXLMEMSIM_MONITOR_H
diff --git a/include/pebs.h b/include/pebs.h
index 03971cb..ba06035 100644
--- a/include/pebs.h
+++ b/include/pebs.h
@@ -2,12 +2,11 @@
 // Created by victoryang00 on 1/13/23.
 //
 
-#ifndef CXL_MEM_SIMULATOR_PEBS_H
-#define CXL_MEM_SIMULATOR_PEBS_H
+#ifndef CXLMEMSIM_PEBS_H
+#define CXLMEMSIM_PEBS_H
 
 #include "cxlcontroller.h"
 #include "helper.h"
-#include "logging.h"
 #include <asm/unistd.h>
 #include <cerrno>
 #include <csignal>
@@ -33,14 +32,13 @@ class PEBS {
     uint64_t sample_period;
     uint32_t seq{};
     size_t rdlen{};
-    size_t mplen;
+    size_t mplen{};
     struct perf_event_mmap_page *mp;
-    bool is_page;
-    PEBS(pid_t, uint64_t, bool);
+    PEBS(pid_t, uint64_t);
     ~PEBS();
     int read(CXLController *, struct PEBSElem *);
     int start();
     int stop();
 };
 
-#endif // CXL_MEM_SIMULATOR_PEBS_H
+#endif // CXLMEMSIM_PEBS_H
diff --git a/include/perf.h b/include/perf.h
index 126418c..1e7568d 100644
--- a/include/perf.h
+++ b/include/perf.h
@@ -2,8 +2,8 @@
 // Created by victoryang00 on 1/14/23.
 //
 
-#ifndef CXL_MEM_SIMULATOR_PERF_H
-#define CXL_MEM_SIMULATOR_PERF_H
+#ifndef CXLMEMSIM_PERF_H
+#define CXLMEMSIM_PERF_H
 
 #include <bpf/bpf.h>
 #include <cstdint>
@@ -23,33 +23,6 @@
 #include <tuple>
 #include <unistd.h>
 
-class ThreadSafeMap {
-public:
-    ThreadSafeMap() = default;
-
-    // Multiple threads/readers can read the Map's value at the same time.
-    std::map<unsigned long, std::tuple<unsigned long, unsigned long long int>> get() const {
-        std::shared_lock lock(mutex_);
-        return res;
-    }
-
-    // Only one thread/writer can increment/write the Map's value.
-    void insert(unsigned long address, unsigned long size, unsigned long long time) {
-        std::unique_lock lock(mutex_);
-        res[address] = std::make_tuple(size, time);
-    }
-
-    // Only one thread/writer can reset/write the Map's value.
-    void reset() {
-        std::unique_lock lock(mutex_);
-        res.clear();
-    }
-
-private:
-    mutable std::shared_mutex mutex_;
-    std::map<unsigned long, std::tuple<unsigned long, unsigned long long>> res;
-};
-
 class PerfInfo {
 public:
     int fd;
@@ -58,18 +31,14 @@ class PerfInfo {
     pid_t pid;
     unsigned long flags;
     struct perf_event_attr attr;
-    ThreadSafeMap *map;
-    std::jthread j;
+    PerfInfo() = default;
     PerfInfo(int group_fd, int cpu, pid_t pid, unsigned long flags, struct perf_event_attr attr);
-    PerfInfo(int fd, int group_fd, int cpu, pid_t pid, unsigned long flags, struct perf_event_attr attr);
     ~PerfInfo();
     ssize_t read_pmu(uint64_t *value);
-    std::map<uint64_t, uint64_t> read_trace_pipe();
     int start();
     int stop();
 };
 
 PerfInfo *init_incore_perf(const pid_t pid, const int cpu, uint64_t conf, uint64_t conf1);
-PerfInfo *init_incore_bpf_perf(const pid_t pid, const int cpu);
-void write_trace_to_map(ThreadSafeMap *map);
-#endif // CXL_MEM_SIMULATOR_PERF_H
+PerfInfo *init_uncore_perf(const pid_t pid, const int cpu, uint64_t conf, uint64_t conf1, int value);
+#endif // CXLMEMSIM_PERF_H
diff --git a/include/policy.h b/include/policy.h
index 0039ee9..dae51e7 100644
--- a/include/policy.h
+++ b/include/policy.h
@@ -2,8 +2,8 @@
 // Created by victoryang00 on 1/12/23.
 //
 
-#ifndef CXL_MEM_SIMULATOR_POLICY_H
-#define CXL_MEM_SIMULATOR_POLICY_H
+#ifndef CXLMEMSIM_POLICY_H
+#define CXLMEMSIM_POLICY_H
 #include "cxlcontroller.h"
 #include "cxlendpoint.h"
 #include "helper.h"
@@ -11,7 +11,7 @@
 
 // Saturate Local 90% and start interleave accrodingly the remote with topology
 // Say 3 remote, 2 200ns, 1 400ns, will give 40% 40% 20%
-class InterleavePolicy : public Policy {
+class InterleavePolicy : public AllocationPolicy {
 
 public:
     InterleavePolicy();
@@ -21,4 +21,4 @@ class InterleavePolicy : public Policy {
     int compute_once(CXLController *) override;
 };
 
-#endif // CXL_MEM_SIMULATOR_POLICY_H
+#endif // CXLMEMSIM_POLICY_H
diff --git a/include/sock.h b/include/sock.h
new file mode 100644
index 0000000..8618a20
--- /dev/null
+++ b/include/sock.h
@@ -0,0 +1,29 @@
+//
+// Created by root on 11/21/23.
+//
+
+#ifndef CXLMEMSIM_SOCK_H
+#define CXLMEMSIM_SOCK_H
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+enum opcode { // event kinds; NOTE(review): presumably the values carried in op_data.opcode — confirm
+    CXLMEMSIM_PROCESS_CREATE = 0, // by name: a process was created
+    CXLMEMSIM_THREAD_CREATE = 1, // by name: a thread was created
+    CXLMEMSIM_THREAD_EXIT = 2, // by name: a thread exited
+    CXLMEMSIM_STABLE_SIGNAL = 3, // meaning not visible from this header — TODO document
+};
+struct op_data { // record made of three 32-bit fields
+    uint32_t tgid; // presumably the thread-group (process) id — confirm
+    uint32_t tid; // presumably the thread id — confirm
+    uint32_t opcode; // presumably one of enum opcode, stored as 32 bits — confirm
+};
+#define SOCKET_PATH "/tmp/cxl_mem_simulator.sock"
+#define OUTPUT_PMU_PATH "./output_pmu.csv"
+
+#ifdef __cplusplus
+}
+#endif
+#endif // CXLMEMSIM_SOCK_H
diff --git a/include/uncore.h b/include/uncore.h
index b83bcee..d12e1ce 100644
--- a/include/uncore.h
+++ b/include/uncore.h
@@ -2,21 +2,24 @@
 // Created by victoryang00 on 1/12/23.
 //
 
-#ifndef CXL_MEM_SIMULATOR_UNCORE_H
-#define CXL_MEM_SIMULATOR_UNCORE_H
+#ifndef CXLMEMSIM_UNCORE_H
+#define CXLMEMSIM_UNCORE_H
 #include "helper.h"
 #include "perf.h"
+#include <array>
 #include <cstdint>
 
 struct PerfConfig;
 class Uncore {
 public:
-    uint32_t unc_idx;
-    PerfInfo *perf;
-    Uncore(const uint32_t unc_idx, PerfConfig *perf_config);
+    uint32_t unc_idx{}; // value-initialised to 0; presumably the index of this uncore unit — confirm
+    int fd{}; // value-initialised to 0; NOTE(review): if this is a file descriptor, 0 is a valid one — check how "unset" is detected
+    std::array<PerfInfo *, 4> perf{nullptr, nullptr, nullptr, nullptr}; // four PerfInfo slots, all null until assigned
+    Uncore(uint32_t unc_idx, PerfConfig *perf_config);
+    // The destructor below is defaulted, so the raw PerfInfo pointers held in `perf` are not deleted by it.
     ~Uncore() = default;
 
-    int read_cbo_elems(struct CBOElem *elem);
+    int read_cha_elems(struct CHAElem *elem); // presumably fills *elem; return-value convention is not visible here
 };
 
-#endif // CXL_MEM_SIMULATOR_UNCORE_H
+#endif // CXLMEMSIM_UNCORE_H
diff --git a/microbench/CMakeLists.txt b/microbench/CMakeLists.txt
index aeac01f..5ee4d65 100644
--- a/microbench/CMakeLists.txt
+++ b/microbench/CMakeLists.txt
@@ -1,5 +1,18 @@
-add_executable(many_calloc ./many_calloc.c)
-add_executable(many_mmap_read ./many_mmap_read.c)
-add_executable(many_mmap_write ./many_mmap_write.c)
-add_executable(many_malloc ./many_malloc.c)
-add_executable(many_sbrk ./many_sbrk.c)
\ No newline at end of file
+# Allocation-oriented C microbenchmarks: one executable per source file.
+foreach(bench calloc mmap_read mmap_write malloc sbrk)
+    add_executable(${bench} ${bench}.c)
+endforeach()
+
+# Stand-alone C++ microbenchmarks: one executable per source file.
+foreach(bench ld_simple nt-ld nt-st ptr-chasing)
+    add_executable(${bench} ${bench}.cpp)
+endforeach()
+
+# Fence-count sweeps: ld.cpp, ld_base.cpp and st.cpp are each compiled once per
+# FENCE_COUNT value, producing targets such as ld1, ld_base64 and st256.
+foreach(kernel ld ld_base st)
+    foreach(fences 1 2 4 8 16 32 64 128 256)
+        add_executable(${kernel}${fences} ${kernel}.cpp)
+        target_compile_definitions(${kernel}${fences} PRIVATE -DFENCE_COUNT=${fences})
+    endforeach()
+endforeach()
diff --git a/microbench/many_calloc.c b/microbench/calloc.c
similarity index 100%
rename from microbench/many_calloc.c
rename to microbench/calloc.c
diff --git a/microbench/ld.cpp b/microbench/ld.cpp
new file mode 100644
index 0000000..d597a58
--- /dev/null
+++ b/microbench/ld.cpp
@@ -0,0 +1,119 @@
+/*
+ * Microbench tests for MLP and memory latency in CXLMS
+ *
+ *  By: Andrew Quinn
+ *      Yiwei Yang
+ *
+ *  Copyright 2023 Regents of the University of California
+ *  UC Santa Cruz Sluglab.
+ */
+
+
+#include <errno.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <cpuid.h>
+#include <pthread.h>
+#include <stdlib.h>
+
+#include <sys/mman.h>
+
+
+#define STR_HELPER(x) #x
+#define STR(x) STR_HELPER(x)
+
+#define MOVE_SIZE 128
+#define MAP_SIZE  (long)(1024 * 1024 * 1024)
+#define CACHELINE_SIZE  64
+
+#ifndef FENCE_COUNT
+#define FENCE_COUNT 8
+#endif
+
+#define FENCE_BOUND (FENCE_COUNT * MOVE_SIZE)
+
+// BODY(start): FENCE_COUNT iterations of a 16-byte movdqa load from start + k * MOVE_SIZE into xmm0, then one mfence.
+// we need to jump in MOVE_SIZE increments otherwise segfault!
+#define BODY(start)						\
+  "xor %%r8, %%r8 \n"						\
+  "LOOP_START%=: \n"						\
+  "lea (%[" #start "], %%r8), %%r9 \n"				\
+  "movdqa  (%%r9), %%xmm0 \n"					\
+  "add $" STR(MOVE_SIZE) ", %%r8 \n"				\
+  "cmp $" STR(FENCE_BOUND) ",%%r8\n"				\
+  "jl LOOP_START%= \n"						\
+  "mfence \n"
+
+
+int main(int argc, char **argv) {
+  // Load benchmark: maps MAP_SIZE bytes, writes and clflushes them, then times 1e3 passes of BODY over the mapping.
+  // in principle, you would want to clear out cache lines (and the
+  // pipeline) before doing any of the inline assembly stuff.  But,
+  // that's hard.  And, it's probably noise when you execute over
+  // enough things.
+  // Returns 0 on success, -1 if mmap fails. argc and argv are not used.
+
+  // allocate some memory
+  char *base =(char *) mmap(nullptr,
+		    MAP_SIZE,
+		    PROT_READ | PROT_WRITE,
+		    MAP_ANONYMOUS | MAP_PRIVATE,
+		    -1,
+		    0);
+
+  if (base == MAP_FAILED) {
+    fprintf(stderr, "oops, you suck %d\n", errno);
+    return -1;
+  }
+  char *addr = NULL;
+
+  intptr_t *iaddr = (intptr_t*) base;
+  intptr_t hash = 0;
+  struct timespec tstart = {0,0}, tend = {0,0};
+
+  // Write every intptr_t of the mapping once before timing; per the original note this keeps allocation costs out of the benchmark
+  while (iaddr < (intptr_t *)(base + MAP_SIZE)) {
+    hash = hash ^ (intptr_t) iaddr;
+    *iaddr = hash;
+    iaddr++;
+  }
+
+  // clflush the mapping in CACHELINE_SIZE steps. (Original note: should flush everything from the cache. But, how big is the cache?)
+  addr = base;
+  while (addr < (base + MAP_SIZE)) {
+    asm volatile(
+		 "mov %[buf], %%rsi\n"
+		 "clflush (%%rsi)\n"
+		 :
+		 : [buf] "r" (addr)
+		 : "rsi");
+    addr += CACHELINE_SIZE;
+  }
+  // Fence before starting the clock (presumably so the flushes above have completed — confirm).
+  asm volatile ("mfence\n" :::);
+  // tstart is read once here; each pass below reads tend and prints tend - tstart, so the printed values are cumulative.
+  clock_gettime(CLOCK_MONOTONIC, &tstart);
+for (int i=0;i<1e3;i++){
+  addr = base;
+  while (addr < (base + MAP_SIZE)) {
+    //fprintf (stderr, "addr %p bound %p\n", addr, base + MAP_SIZE);
+    asm volatile(
+		 BODY(addr)
+		 :
+		 : [addr] "r" (addr)
+		 : "r8", "r9", "xmm0");
+
+      addr += (FENCE_COUNT * MOVE_SIZE);
+  }
+  clock_gettime(CLOCK_MONOTONIC, &tend);
+  uint64_t nanos = (1000000000  * tend.tv_sec + tend.tv_nsec);
+  nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec);
+
+
+  printf("%lu\n", nanos);
+}
+  return 0;
+}
diff --git a/microbench/ld_base.cpp b/microbench/ld_base.cpp
new file mode 100644
index 0000000..9a299ce
--- /dev/null
+++ b/microbench/ld_base.cpp
@@ -0,0 +1,116 @@
+/*
+ * Microbench tests for MLP and memory latency in CXLMS
+ *
+ *  By: Andrew Quinn
+ *      Yiwei Yang
+ *
+ *  Copyright 2023 Regents of the University of California
+ *  UC Santa Cruz Sluglab.
+ */
+
+
+#include <errno.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <cpuid.h>
+#include <pthread.h>
+#include <stdlib.h>
+
+#include <sys/mman.h>
+
+
+#define STR_HELPER(x) #x
+#define STR(x) STR_HELPER(x)
+
+#define MOVE_SIZE 128
+#define MAP_SIZE  (long)(1024 * 1024 * 1024)
+#define CACHELINE_SIZE  64
+
+#ifndef FENCE_COUNT
+#define FENCE_COUNT 8
+#endif
+
+#define FENCE_BOUND (FENCE_COUNT * MOVE_SIZE)
+
+// BODY(start): baseline loop — FENCE_COUNT iterations that only compute start + k * MOVE_SIZE with lea (no memory access), then one mfence.
+// we need to jump in MOVE_SIZE increments otherwise segfault!
+#define BODY(start)						\
+  "xor %%r8, %%r8 \n"						\
+  "LOOP_START%=: \n"						\
+  "lea (%[" #start "], %%r8), %%r9 \n"				\
+  "add $" STR(MOVE_SIZE) ", %%r8 \n"				\
+  "cmp $" STR(FENCE_BOUND) ",%%r8\n"				\
+  "jl LOOP_START%= \n"						\
+  "mfence \n"
+
+
+int main(int argc, char **argv) {
+  // Baseline benchmark: maps MAP_SIZE bytes, writes and clflushes them, then times one pass of BODY over the mapping.
+  // in principle, you would want to clear out cache lines (and the
+  // pipeline) before doing any of the inline assembly stuff.  But,
+  // that's hard.  And, it's probably noise when you execute over
+  // enough things.
+  // Returns 0 on success, -1 if mmap fails. argc and argv are not used.
+
+  // allocate some memory
+  char *base =(char *) mmap(nullptr,
+		    MAP_SIZE,
+		    PROT_READ | PROT_WRITE,
+		    MAP_ANONYMOUS | MAP_PRIVATE,
+		    -1,
+		    0);
+
+  if (base == MAP_FAILED) {
+    fprintf(stderr, "oops, you suck %d\n", errno);
+    return -1;
+  }
+  char *addr = NULL;
+
+  intptr_t *iaddr = (intptr_t*) base;
+  intptr_t hash = 0;
+  struct timespec tstart = {0,0}, tend = {0,0};
+
+  // Write every intptr_t of the mapping once before timing; per the original note this keeps allocation costs out of the benchmark
+  while (iaddr < (intptr_t *)(base + MAP_SIZE)) {
+    hash = hash ^ (intptr_t) iaddr;
+    *iaddr = hash;
+    iaddr++;
+  }
+
+  // clflush the mapping in CACHELINE_SIZE steps. (Original note: should flush everything from the cache. But, how big is the cache?)
+  addr = base;
+  while (addr < (base + MAP_SIZE)) {
+    asm volatile(
+		 "mov %[buf], %%rsi\n"
+		 "clflush (%%rsi)\n"
+		 :
+		 : [buf] "r" (addr)
+		 : "rsi");
+    addr += CACHELINE_SIZE;
+  }
+  // Fence before starting the clock (presumably so the flushes above have completed — confirm).
+  asm volatile ("mfence\n" :::);
+  // Timed region: one BODY per FENCE_COUNT * MOVE_SIZE bytes of the mapping.
+  clock_gettime(CLOCK_MONOTONIC, &tstart);
+  addr = base;
+  while (addr < (base + MAP_SIZE)) {
+    //fprintf (stderr, "addr %p bound %p\n", addr, base + MAP_SIZE);
+    asm volatile(
+		 BODY(addr)
+		 :
+		 : [addr] "r" (addr)
+		 : "r8", "r9", "xmm0"); // xmm0 is listed although this file's BODY uses no xmm register (harmless)
+
+      addr += (FENCE_COUNT * MOVE_SIZE);
+  }
+  clock_gettime(CLOCK_MONOTONIC, &tend);
+  uint64_t nanos = (1000000000  * tend.tv_sec + tend.tv_nsec);
+  nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec);
+
+  // Print the elapsed time of the timed region in nanoseconds.
+  printf("%lu\n", nanos);
+  return 0;
+}
diff --git a/microbench/ld_simple.cpp b/microbench/ld_simple.cpp
new file mode 100644
index 0000000..282d9f6
--- /dev/null
+++ b/microbench/ld_simple.cpp
@@ -0,0 +1,97 @@
+/* **********************************************************
+ * Copyright (c) 2018-2023 Google LLC  All rights reserved.
+ * **********************************************************/
+
+/*
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of Google, Inc. nor the names of its contributors may be
+ *   used to endorse or promote products derived from this software without
+ *   specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE LLC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+/* This microbenchmark suffers from a significant number of last-level cache
+ * (LLC) misses. SW prefetching can significantly improve its performance.
+ *
+ * The cache miss analyzer can be used to identify the load instruction that
+ * is suffering from most of the LLC misses in this microbenchmark. The analyzer
+ * can also produce prefetching hints for this microbenchmark. To run the
+ * analyzer on this microbenchmark and write the prefetching hints in a text
+ * file called "rec.csv", perform the following:
+ * * Compile the microbenchmark. Assuming g++ is the compiler being used:
+ *   $ g++ -O3 -o stride_benchmark stride_benchmark.cpp
+ * * Run the analyzer:
+ *   $ bin64/drrun -t drcachesim -simulator_type miss_analyzer -LL_miss_file rec.csv -- \
+ *     stride_benchmark
+ *
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <iostream>
+
+#define MEM_BARRIER() __asm__ __volatile__("" ::: "memory")
+
+int
+main(int argc, const char *argv[])
+{
+    // Cache line size in bytes.
+    const int kLineSize = 64;
+    // Number of cache lines skipped by the stream every iteration.
+    const int kStride = 7;
+    // Number of 1-byte elements in the array.
+    const size_t kArraySize = 1024 * 1024 * 1024;
+    // Number of iterations in the main loop.
+    const int kIterations = 1e4;
+    // The main vector/array used for emulating pointer chasing; every byte is set to kStride.
+    unsigned char *buffer = new unsigned char[kArraySize];
+    memset(buffer, kStride, kArraySize);
+
+    // Add a memory barrier so the call doesn't get optimized away or
+    // reordered with respect to callers.
+    MEM_BARRIER();
+
+    int position = 0;
+
+    // Here the code will pointer chase through the array skipping forward
+    // kStride cache lines at a time. Since kStride is an odd number, the main
+    // loop will touch different cache lines as it wraps around.
+    for (int loop = 0; loop < kIterations; ++loop) {
+        // This prefetching instruction results in a speedup of >2x
+        // on a Skylake machine running Linux when compiled with g++ -O3.
+      //const int prefetch_distance = 5 * kStride * kLineSize;
+      //__builtin_prefetch(&buffer[position + prefetch_distance], 0, 0);
+        // NOTE(review): `position = loop` below overwrites the value just derived from buffer[position], so the next index is the loop counter — confirm intended.
+        position += (buffer[position] * kLineSize);
+	position = loop;
+        position &= (kArraySize - 1);
+    }
+
+    // Add a memory barrier so the call doesn't get optimized away or
+    // reordered with respect to callers.
+    MEM_BARRIER();
+    // NOTE(review): buffer is never delete[]d before returning.
+//    std::cerr << "Value = " << position << std::endl;
+
+    return 0;
+}
diff --git a/microbench/many_malloc.c b/microbench/malloc.c
similarity index 100%
rename from microbench/many_malloc.c
rename to microbench/malloc.c
diff --git a/microbench/many_mmap_read.c b/microbench/mmap_read.c
similarity index 100%
rename from microbench/many_mmap_read.c
rename to microbench/mmap_read.c
diff --git a/microbench/many_mmap_write.c b/microbench/mmap_write.c
similarity index 100%
rename from microbench/many_mmap_write.c
rename to microbench/mmap_write.c
diff --git a/microbench/nt-ld.cpp b/microbench/nt-ld.cpp
new file mode 100644
index 0000000..4792322
--- /dev/null
+++ b/microbench/nt-ld.cpp
@@ -0,0 +1,119 @@
+#include "uarch.h"
+
+int main() {
+    int i;
+    long long aggregated = 0, aggregated2 = 0;
+    long seed = 0xdeadbeef1245678;
+    uint64_t a = 0xfc0;
+    int access_size = 64;
+    int stride_size = 64;
+    int delay = 64;
+    int count = 32;
+    uint64_t *cindex;
+    uint64_t csize;
+    int ret;
+    // Only the "Naive RaW job" loop below is live; every other variant in this function is commented out.
+    //    for (i = 0; i < 100000; i++) {
+    //        char *buf = malloc(4096 * 1024);
+    //        buf = buf + 64 - (((long)buf) % 64);
+    //        // Separate RaW job
+    //        RAW_BEFORE_WRITE
+    //        stride_storeclwb(buf, access_size, stride_size, delay, count);
+    //        asm volatile("mfence \n" :::);
+    //        RAW_BEFORE_READ
+    //        stride_nt(buf, access_size, stride_size, delay, count);
+    //        asm volatile("mfence \n" :::);
+    //        RAW_FINAL("raw-separate")
+    //
+    //        aggregated += diff;
+    //        aggregated2 += c_ntload_end - c_store_start;
+    //    }
+    //
+    //    printf("Separate RaW job %lld %lld\n", aggregated / 100000 / count, aggregated2 / 100000 / count);
+    aggregated = 0;
+    aggregated2 = 0;
+    for (i = 0; i < 100000; i++) {
+        char *buf = static_cast<char *>(malloc(4096 * 1024)); // NOTE(review): not NULL-checked and never freed (4 MiB per iteration)
+        buf = buf + 64 - (((long)buf) % 64); // advance to the next 64-byte boundary (a full 64 bytes if already aligned)
+        // Naive RaW job: one stride_read_after_write call bracketed by the RAW_* macros (not defined in this file; presumably uarch.h)
+        RAW_BEFORE_WRITE
+        RAW_BEFORE_READ
+        stride_read_after_write(buf, access_size, stride_size, delay, count);
+        asm volatile("mfence \n" :::);
+        RAW_FINAL("raw-combined")
+        aggregated += diff; // diff, c_ntload_end and c_store_start are not declared in this function
+        aggregated2 += c_ntload_end - c_store_start;
+    }
+    printf("Naive RaW job %lld %lld\n", aggregated / 100000 / count, aggregated2 / 100000 / count);
+    aggregated = 0;
+    aggregated2 = 0;
+
+    //    for (i = 0; i < 100000; i++) {
+    //        char *buf = malloc(4096 * 1024);
+    //        buf = buf + 64 - (((long)buf) % 64);
+    //        RAW_BEFORE_WRITE
+    //        sizebw_storeclwb(buf, access_size, count, &seed, a);
+    //        asm volatile("mfence \n" :::);
+    //        RAW_FINAL("sizebw_storeclwb")
+    //        aggregated += diff;
+    //        aggregated2 += c_ntload_end - c_store_start;
+    //    }
+    //    printf("sizebw_storeclwb %lld %lld\n", aggregated / 100000 / count, aggregated2 / 100000 / count);
+    //
+    //    aggregated = 0;
+    //    aggregated2 = 0;
+    //    for (i = 0; i < 100000; i++) {
+    //        char *buf = malloc(4096 * 1024);
+    //        buf = buf + 64 - (((long)buf) % 64);
+    //        char *virt_addr = malloc(4096 * 1024);
+    //        virt_addr = virt_addr + 64 - (((long)virt_addr) % 64);
+    //        // Pointer chasing RaW job
+    //        // No need to fill report fs page table, init_chasing_index will do that
+    //        csize = access_size / CACHELINE_SIZE;
+    //        cindex = (uint64_t *)(virt_addr);
+    //        ret = init_chasing_index(cindex, csize);
+    //
+    //        RAW_BEFORE_WRITE
+    //        chasing_storeclwb(buf, access_size, stride_size, count, cindex);
+    //        asm volatile("mfence \n" :::);
+    //        RAW_BEFORE_READ
+    //        chasing_loadnt(buf, access_size, stride_size, count, cindex);
+    //        asm volatile("mfence \n" :::);
+    //        RAW_FINAL("raw-chasing")
+    //
+    //        aggregated += diff;
+    //        aggregated2 += c_ntload_end - c_store_start;
+    //    }
+    //    printf("pointer chasing 2 hop %lld %lld\n", aggregated / 100000 / count, aggregated2 / 100000 / count);
+    //
+    //    aggregated = 0;
+    //    aggregated2 = 0;
+    //    for (i = 0; i < 100000; i++) {
+    //        char *buf = malloc(4096 * 1024);
+    //        buf = buf + 64 - (((long)buf) % 64);
+    //        char *virt_addr = malloc(4096 * 1024);
+    //        virt_addr = virt_addr + 64 - (((long)virt_addr) % 64);
+    //        // Pointer chasing RaW job
+    //        // No need to fill report fs page table, init_chasing_index will do that
+    //        csize = access_size / CACHELINE_SIZE;
+    //        cindex = (uint64_t *)(virt_addr);
+    //        ret = init_chasing_index(cindex, csize);
+    //
+    //        RAW_BEFORE_WRITE
+    //        chasing_storeclwb(buf, access_size, stride_size, count, cindex);
+    //        asm volatile("mfence \n" :::);
+    //        RAW_BEFORE_READ
+    //        chasing_loadnt(buf, access_size, stride_size, count, cindex);
+    //        asm volatile("mfence \n" :::);
+    //        chasing_storeclwb(buf, access_size, stride_size, count, cindex);
+    //        asm volatile("mfence \n" :::);
+    //        chasing_loadnt(buf, access_size, stride_size, count, cindex);
+    //        asm volatile("mfence \n" :::);
+    //        RAW_FINAL("raw-chasing")
+    //
+    //        aggregated += diff;
+    //        aggregated2 += c_ntload_end - c_store_start;
+    //    }
+    //    printf("pointer chasing 4 hop %lld %lld\n", aggregated / 100000 / count, aggregated2 / 100000 / count);
+    return 0;
+}
diff --git a/microbench/nt-st.cpp b/microbench/nt-st.cpp
new file mode 100644
index 0000000..3f03481
--- /dev/null
+++ b/microbench/nt-st.cpp
@@ -0,0 +1,34 @@
+#include "uarch.h"
+
+int main() {
+    int i;
+    long long aggregated = 0, aggregated2 = 0;
+    long seed = 0xdeadbeef1245678;
+    uint64_t a = 0xfc0;
+    int access_size = 64;
+    int stride_size = 64;
+    int delay = 64;
+    int count = 32;
+    uint64_t *cindex;
+    uint64_t csize;
+    int ret;
+    // 100000 iterations of a store pass (stride_storeclwb) then a read pass (stride_nt), each followed by mfence.
+    for (i = 0; i < 100000; i++) {
+        char *buf = static_cast<char *>(malloc(4096 * 1024)); // NOTE(review): not NULL-checked and never freed (4 MiB per iteration)
+        buf = buf + 64 - (((long)buf) % 64); // advance to the next 64-byte boundary (a full 64 bytes if already aligned)
+        // Separate RaW job; the RAW_* macros are not defined in this file (presumably uarch.h)
+        RAW_BEFORE_WRITE
+        stride_storeclwb(buf, access_size, stride_size, delay, count);
+        asm volatile("mfence \n" :::);
+        RAW_BEFORE_READ
+        stride_nt(buf, access_size, stride_size, delay, count);
+        asm volatile("mfence \n" :::);
+        RAW_FINAL("raw-separate")
+        // diff, c_ntload_end and c_store_start are not declared in this function
+        aggregated += diff;
+        aggregated2 += c_ntload_end - c_store_start;
+    }
+    // Print both accumulators divided by the iteration count and by count.
+        printf("Separate RaW job %lld %lld\n", aggregated / 100000 / count, aggregated2 / 100000 / count);
+    return 0;
+}
diff --git a/microbench/ptr-chasing.cpp b/microbench/ptr-chasing.cpp
new file mode 100644
index 0000000..ba5d7c2
--- /dev/null
+++ b/microbench/ptr-chasing.cpp
@@ -0,0 +1,46 @@
+#include "uarch.h"
+
+int main() {
+    int i;
+    long long aggregated = 0, aggregated2 = 0;
+    long seed = 0xdeadbeef1245678;
+    uint64_t a = 0xfc0;
+    int access_size = 64;
+    int stride_size = 64;
+    int delay = 64;
+    int count = 32;
+    uint64_t *cindex;
+    uint64_t csize;
+    int ret;
+    // 100000 iterations of two chasing_storeclwb / chasing_loadnt rounds, each call followed by mfence.
+    aggregated = 0;
+    aggregated2 = 0;
+    for (i = 0; i < 100000; i++) {
+        char *buf = static_cast<char *>(malloc(4096 * 1024)); // NOTE(review): not NULL-checked and never freed (4 MiB per iteration)
+        buf = buf + 64 - (((long)buf) % 64); // advance to the next 64-byte boundary (a full 64 bytes if already aligned)
+        char *virt_addr = static_cast<char *>(malloc(4096 * 1024)); // NOTE(review): same — unchecked and never freed
+        virt_addr = virt_addr + 64 - (((long)virt_addr) % 64);
+        // Pointer chasing RaW job
+        // No need to fill report fs page table, init_chasing_index will do that
+        csize = access_size / CACHELINE_SIZE;
+        cindex = (uint64_t *)(virt_addr);
+        ret = init_chasing_index(cindex, csize); // NOTE(review): ret is assigned but never checked
+
+        RAW_BEFORE_WRITE
+        chasing_storeclwb(buf, access_size, stride_size, count, cindex);
+        asm volatile("mfence \n" :::);
+        RAW_BEFORE_READ
+        chasing_loadnt(buf, access_size, stride_size, count, cindex);
+        asm volatile("mfence \n" :::);
+        chasing_storeclwb(buf, access_size, stride_size, count, cindex);
+        asm volatile("mfence \n" :::);
+        chasing_loadnt(buf, access_size, stride_size, count, cindex);
+        asm volatile("mfence \n" :::);
+        RAW_FINAL("raw-chasing")
+        // diff, c_ntload_end and c_store_start are not declared in this function (presumably provided via uarch.h)
+        aggregated += diff;
+        aggregated2 += c_ntload_end - c_store_start;
+    }
+    printf("pointer chasing 4 hop %lld %lld\n", aggregated / 100000 / count, aggregated2 / 100000 / count);
+    return 0;
+}
diff --git a/microbench/many_sbrk.c b/microbench/sbrk.c
similarity index 95%
rename from microbench/many_sbrk.c
rename to microbench/sbrk.c
index 1da226e..e1b53dd 100644
--- a/microbench/many_sbrk.c
+++ b/microbench/sbrk.c
@@ -136,9 +136,9 @@ void my_free(void *p) {
         if (ptr->succ != 0)
             ptr->succ->prev = pred;
         // end added
-        printf("BKR freeing block %#x merging with predecessor new size is %d.\n", p, pred->size);
+        printf("BKR freeing block %p merging with predecessor new size is %d.\n", p, pred->size); // plain %p: the '#' flag is undefined for pointers
     } else {
-        printf("BKR freeing block %#x.\n", p);
+        printf("BKR freeing block %p.\n", p);
         arr[i++] = ptr;
         ptr->isfree = 1;
         pred = ptr;
@@ -153,7 +153,7 @@ void my_free(void *p) {
             succ->succ->prev = pred;
         // end added
         arr[i++] = ptr;
-        printf("BKR freeing block %#x merging with successor new size is %d.\n", p, pred->size);
+        printf("BKR freeing block %p merging with successor new size is %d.\n", p, pred->size);
     }
 }
 
@@ -161,7 +161,7 @@ int main(int argc, const char *const *argv) {
 
     size_t mbcount = 100;
 
-    printf("allocating %d MB\n", mbcount);
+    printf("allocating %zu MB\n", mbcount); // mbcount is a size_t, so the matching conversion is %zu
     uint8_t *p;
     p = (uint8_t *)my_malloc(mbcount * 1024ULL * 1024ULL);
 
diff --git a/microbench/st.cpp b/microbench/st.cpp
new file mode 100644
index 0000000..e798086
--- /dev/null
+++ b/microbench/st.cpp
@@ -0,0 +1,119 @@
+/*
+ * Microbench tests for MLP and memory latency in CXLMS
+ *
+ *  By: Andrew Quinn
+ *      Yiwei Yang
+ *
+ *  Copyright 2023 Regents of the University of California
+ *  UC Santa Cruz Sluglab.
+ */
+
+
+#include <errno.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <cpuid.h>
+#include <pthread.h>
+#include <stdlib.h>
+
+#include <sys/mman.h>
+
+
+#define STR_HELPER(x) #x
+#define STR(x) STR_HELPER(x)
+
+#define MOVE_SIZE 128
+#define MAP_SIZE  (long)(1024 * 1024 * 1024)
+#define CACHELINE_SIZE  64
+
+#ifndef FENCE_COUNT
+#define FENCE_COUNT 8
+#endif
+
+#define FENCE_BOUND (FENCE_COUNT * MOVE_SIZE)
+
+// BODY(start): zero xmm1, then FENCE_COUNT iterations of a 16-byte movdqa store of xmm1 to start + k * MOVE_SIZE, then one mfence.
+// we need to jump in MOVE_SIZE increments otherwise segfault!
+#define BODY(start)						\
+  "xor %%r8, %%r8 \n"						\
+  "pxor %%xmm1, %%xmm1 \n"					\
+  "LOOP_START%=: \n"						\
+  "lea (%[" #start "], %%r8), %%r9 \n"				\
+  "movdqa  %%xmm1, (%%r9) \n"					\
+  "add $" STR(MOVE_SIZE) ", %%r8 \n"				\
+  "cmp $" STR(FENCE_BOUND) ",%%r8\n"				\
+  "jl LOOP_START%= \n"						\
+  "mfence \n"
+
+
+int main(int argc, char **argv) {
+  // Store benchmark: maps MAP_SIZE bytes, writes and clflushes them, then times one pass of BODY over the mapping.
+  // in principle, you would want to clear out cache lines (and the
+  // pipeline) before doing any of the inline assembly stuff.  But,
+  // that's hard.  And, it's probably noise when you execute over
+  // enough things.
+  // Returns 0 on success, -1 if mmap fails. argc and argv are not used.
+
+  // allocate some memory
+  char *base =(char *) mmap(nullptr,
+		    MAP_SIZE,
+		    PROT_READ | PROT_WRITE,
+		    MAP_ANONYMOUS | MAP_PRIVATE,
+		    -1,
+		    0);
+
+  if (base == MAP_FAILED) {
+    fprintf(stderr, "oops, you suck %d\n", errno);
+    return -1;
+  }
+  char *addr = NULL;
+
+  intptr_t *iaddr = (intptr_t*) base;
+  intptr_t hash = 0;
+  struct timespec tstart = {0,0}, tend = {0,0};
+
+  // Write every intptr_t of the mapping once before timing; per the original note this keeps allocation costs out of the benchmark
+  while (iaddr < (intptr_t *)(base + MAP_SIZE)) {
+    hash = hash ^ (intptr_t) iaddr;
+    *iaddr = hash;
+    iaddr++;
+  }
+
+  // clflush the mapping in CACHELINE_SIZE steps. (Original note: should flush everything from the cache. But, how big is the cache?)
+  addr = base;
+  while (addr < (base + MAP_SIZE)) {
+    asm volatile(
+		 "mov %[buf], %%rsi\n"
+		 "clflush (%%rsi)\n"
+		 :
+		 : [buf] "r" (addr)
+		 : "rsi");
+    addr += CACHELINE_SIZE;
+  }
+  // Fence before starting the clock (presumably so the flushes above have completed — confirm).
+  asm volatile ("mfence\n" :::);
+  // Timed region: one BODY per FENCE_COUNT * MOVE_SIZE bytes of the mapping.
+  clock_gettime(CLOCK_MONOTONIC, &tstart);
+  addr = base;
+  while (addr < (base + MAP_SIZE)) {
+    //fprintf (stderr, "addr %p bound %p\n", addr, base + MAP_SIZE);
+    asm volatile(
+		 BODY(addr)
+		 :
+		 : [addr] "r" (addr)
+		 : "r8", "r9", "xmm1", "memory"); // BODY writes xmm1 (not xmm0) and stores to memory that is not an asm operand
+
+      addr += (FENCE_COUNT * MOVE_SIZE);
+  }
+  clock_gettime(CLOCK_MONOTONIC, &tend);
+  uint64_t nanos = (1000000000  * tend.tv_sec + tend.tv_nsec);
+  nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec);
+
+  // Print the elapsed time of the timed region in nanoseconds.
+  printf("%lu\n", nanos);
+  return 0;
+}
+
diff --git a/microbench/uarch.h b/microbench/uarch.h
new file mode 100644
index 0000000..6b45b7d
--- /dev/null
+++ b/microbench/uarch.h
@@ -0,0 +1,1036 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+uint32_t *lfs_random_array; /* not referenced in the visible part of this header; NOTE(review): defined (not declared extern) in a header -- confirm it is included from only one translation unit */
+#define KERNEL_BEGIN                                                                                                   \
+    do {                                                                                                               \
+    } while (0); /* empty marker statement; the trailing ';' is part of the macro, so call sites write KERNEL_BEGIN with no semicolon */
+#define KERNEL_END                                                                                                     \
+    do {                                                                                                               \
+    } while (0); /* same empty marker, written after each kernel's asm statement */
+#define CACHELINE_SIZE 64 /* presumably the cache-line size in bytes -- TODO confirm for the target CPU */
+
+#define SIZEBTNT_64_AVX512                                                                                             \
+    "vmovntdq  %%zmm0,  0x0(%%r9, %%r10) \n"                                                                           \
+    "add $0x40, %%r10 \n" /* one 64-byte non-temporal store of zmm0 at r9 + r10, then r10 += 0x40; the larger SIZEBTNT_*_AVX512 macros unroll this at consecutive 0x40 offsets */
+
+#define SIZEBTNT_128_AVX512                                                                                            \
+    "vmovntdq  %%zmm0,  0x0(%%r9, %%r10) \n"                                                                           \
+    "vmovntdq  %%zmm0,  0x40(%%r9, %%r10) \n"                                                                          \
+    "add $0x80, %%r10 \n"
+
+#define SIZEBTNT_256_AVX512                                                                                            \
+    "vmovntdq  %%zmm0,  0x0(%%r9, %%r10) \n"                                                                           \
+    "vmovntdq  %%zmm0,  0x40(%%r9, %%r10) \n"                                                                          \
+    "vmovntdq  %%zmm0,  0x80(%%r9, %%r10) \n"                                                                          \
+    "vmovntdq  %%zmm0,  0xc0(%%r9, %%r10) \n"                                                                          \
+    "add $0x100, %%r10 \n"
+
+#define SIZEBTNT_512_AVX512                                                                                            \
+    "vmovntdq  %%zmm0,  0x0(%%r9, %%r10) \n"                                                                           \
+    "vmovntdq  %%zmm0,  0x40(%%r9, %%r10) \n"                                                                          \
+    "vmovntdq  %%zmm0,  0x80(%%r9, %%r10) \n"                                                                          \
+    "vmovntdq  %%zmm0,  0xc0(%%r9, %%r10) \n"                                                                          \
+    "vmovntdq  %%zmm0,  0x100(%%r9, %%r10) \n"                                                                         \
+    "vmovntdq  %%zmm0,  0x140(%%r9, %%r10) \n"                                                                         \
+    "vmovntdq  %%zmm0,  0x180(%%r9, %%r10) \n"                                                                         \
+    "vmovntdq  %%zmm0,  0x1c0(%%r9, %%r10) \n"                                                                         \
+    "add $0x200, %%r10 \n"
+
+#define SIZEBTNT_1024_AVX512                                                                                           \
+    "vmovntdq  %%zmm0,  0x0(%%r9, %%r10) \n"                                                                           \
+    "vmovntdq  %%zmm0,  0x40(%%r9, %%r10) \n"                                                                          \
+    "vmovntdq  %%zmm0,  0x80(%%r9, %%r10) \n"                                                                          \
+    "vmovntdq  %%zmm0,  0xc0(%%r9, %%r10) \n"                                                                          \
+    "vmovntdq  %%zmm0,  0x100(%%r9, %%r10) \n"                                                                         \
+    "vmovntdq  %%zmm0,  0x140(%%r9, %%r10) \n"                                                                         \
+    "vmovntdq  %%zmm0,  0x180(%%r9, %%r10) \n"                                                                         \
+    "vmovntdq  %%zmm0,  0x1c0(%%r9, %%r10) \n"                                                                         \
+    "vmovntdq  %%zmm0,  0x200(%%r9, %%r10) \n"                                                                         \
+    "vmovntdq  %%zmm0,  0x240(%%r9, %%r10) \n"                                                                         \
+    "vmovntdq  %%zmm0,  0x280(%%r9, %%r10) \n"                                                                         \
+    "vmovntdq  %%zmm0,  0x2c0(%%r9, %%r10) \n"                                                                         \
+    "vmovntdq  %%zmm0,  0x300(%%r9, %%r10) \n"                                                                         \
+    "vmovntdq  %%zmm0,  0x340(%%r9, %%r10) \n"                                                                         \
+    "vmovntdq  %%zmm0,  0x380(%%r9, %%r10) \n"                                                                         \
+    "vmovntdq  %%zmm0,  0x3c0(%%r9, %%r10) \n"                                                                         \
+    "add $0x400, %%r10 \n"
+
+#define SIZEBTSTFLUSH_64_AVX512                                                                                        \
+    "vmovdqa64  %%zmm0,  0x0(%%r9, %%r10) \n"                                                                          \
+    "clwb  0x0(%%r9, %%r10) \n"                                                                                        \
+    "add $0x40, %%r10 \n" /* 64-byte zmm0 store at r9 + r10 followed by clwb of the same address, then r10 += 0x40; the larger SIZEBTSTFLUSH_*_AVX512 macros repeat the store/clwb pair */
+
+#define SIZEBTSTFLUSH_128_AVX512                                                                                       \
+    "vmovdqa64  %%zmm0,  0x0(%%r9, %%r10) \n"                                                                          \
+    "clwb  0x0(%%r9, %%r10) \n"                                                                                        \
+    "vmovdqa64  %%zmm0,  0x40(%%r9, %%r10) \n"                                                                         \
+    "clwb  0x40(%%r9, %%r10) \n"                                                                                       \
+    "add $0x80, %%r10 \n"
+
+#define SIZEBTSTFLUSH_256_AVX512                                                                                       \
+    "vmovdqa64  %%zmm0,  0x0(%%r9, %%r10) \n"                                                                          \
+    "clwb  0x0(%%r9, %%r10) \n"                                                                                        \
+    "vmovdqa64  %%zmm0,  0x40(%%r9, %%r10) \n"                                                                         \
+    "clwb  0x40(%%r9, %%r10) \n"                                                                                       \
+    "vmovdqa64  %%zmm0,  0x80(%%r9, %%r10) \n"                                                                         \
+    "clwb  0x80(%%r9, %%r10) \n"                                                                                       \
+    "vmovdqa64  %%zmm0,  0xc0(%%r9, %%r10) \n"                                                                         \
+    "clwb  0xc0(%%r9, %%r10) \n"                                                                                       \
+    "add $0x100, %%r10 \n"
+
+#define SIZEBTSTFLUSH_512_AVX512                                                                                       \
+    "vmovdqa64  %%zmm0,  0x0(%%r9, %%r10) \n"                                                                          \
+    "clwb  0x0(%%r9, %%r10) \n"                                                                                        \
+    "vmovdqa64  %%zmm0,  0x40(%%r9, %%r10) \n"                                                                         \
+    "clwb  0x40(%%r9, %%r10) \n"                                                                                       \
+    "vmovdqa64  %%zmm0,  0x80(%%r9, %%r10) \n"                                                                         \
+    "clwb  0x80(%%r9, %%r10) \n"                                                                                       \
+    "vmovdqa64  %%zmm0,  0xc0(%%r9, %%r10) \n"                                                                         \
+    "clwb  0xc0(%%r9, %%r10) \n"                                                                                       \
+    "vmovdqa64  %%zmm0,  0x100(%%r9, %%r10) \n"                                                                        \
+    "clwb  0x100(%%r9, %%r10) \n"                                                                                      \
+    "vmovdqa64  %%zmm0,  0x140(%%r9, %%r10) \n"                                                                        \
+    "clwb  0x140(%%r9, %%r10) \n"                                                                                      \
+    "vmovdqa64  %%zmm0,  0x180(%%r9, %%r10) \n"                                                                        \
+    "clwb  0x180(%%r9, %%r10) \n"                                                                                      \
+    "vmovdqa64  %%zmm0,  0x1c0(%%r9, %%r10) \n"                                                                        \
+    "clwb  0x1c0(%%r9, %%r10) \n"                                                                                      \
+    "add $0x200, %%r10 \n"
+
+#define SIZEBTSTFLUSH_1024_AVX512                                                                                      \
+    "vmovdqa64  %%zmm0,  0x0(%%r9, %%r10) \n"                                                                          \
+    "clwb  0x0(%%r9, %%r10) \n"                                                                                        \
+    "vmovdqa64  %%zmm0,  0x40(%%r9, %%r10) \n"                                                                         \
+    "clwb  0x40(%%r9, %%r10) \n"                                                                                       \
+    "vmovdqa64  %%zmm0,  0x80(%%r9, %%r10) \n"                                                                         \
+    "clwb  0x80(%%r9, %%r10) \n"                                                                                       \
+    "vmovdqa64  %%zmm0,  0xc0(%%r9, %%r10) \n"                                                                         \
+    "clwb  0xc0(%%r9, %%r10) \n"                                                                                       \
+    "vmovdqa64  %%zmm0,  0x100(%%r9, %%r10) \n"                                                                        \
+    "clwb  0x100(%%r9, %%r10) \n"                                                                                      \
+    "vmovdqa64  %%zmm0,  0x140(%%r9, %%r10) \n"                                                                        \
+    "clwb  0x140(%%r9, %%r10) \n"                                                                                      \
+    "vmovdqa64  %%zmm0,  0x180(%%r9, %%r10) \n"                                                                        \
+    "clwb  0x180(%%r9, %%r10) \n"                                                                                      \
+    "vmovdqa64  %%zmm0,  0x1c0(%%r9, %%r10) \n"                                                                        \
+    "clwb  0x1c0(%%r9, %%r10) \n"                                                                                      \
+    "vmovdqa64  %%zmm0,  0x200(%%r9, %%r10) \n"                                                                        \
+    "clwb  0x200(%%r9, %%r10) \n"                                                                                      \
+    "vmovdqa64  %%zmm0,  0x240(%%r9, %%r10) \n"                                                                        \
+    "clwb  0x240(%%r9, %%r10) \n"                                                                                      \
+    "vmovdqa64  %%zmm0,  0x280(%%r9, %%r10) \n"                                                                        \
+    "clwb  0x280(%%r9, %%r10) \n"                                                                                      \
+    "vmovdqa64  %%zmm0,  0x2c0(%%r9, %%r10) \n"                                                                        \
+    "clwb  0x2c0(%%r9, %%r10) \n"                                                                                      \
+    "vmovdqa64  %%zmm0,  0x300(%%r9, %%r10) \n"                                                                        \
+    "clwb  0x300(%%r9, %%r10) \n"                                                                                      \
+    "vmovdqa64  %%zmm0,  0x340(%%r9, %%r10) \n"                                                                        \
+    "clwb  0x340(%%r9, %%r10) \n"                                                                                      \
+    "vmovdqa64  %%zmm0,  0x380(%%r9, %%r10) \n"                                                                        \
+    "clwb  0x380(%%r9, %%r10) \n"                                                                                      \
+    "vmovdqa64  %%zmm0,  0x3c0(%%r9, %%r10) \n"                                                                        \
+    "clwb  0x3c0(%%r9, %%r10) \n"                                                                                      \
+    "add $0x400, %%r10 \n"
+
+#define SIZEBTST_64_AVX512                                                                                             \
+    "vmovdqa64  %%zmm0,  0x0(%%r9, %%r10) \n"                                                                          \
+    "add $0x40, %%r10 \n" /* one 64-byte store of zmm0 at r9 + r10 (vmovdqa64, the aligned form), then r10 += 0x40; the larger SIZEBTST_*_AVX512 macros unroll this */
+
+#define SIZEBTST_128_AVX512                                                                                            \
+    "vmovdqa64  %%zmm0,  0x0(%%r9, %%r10) \n"                                                                          \
+    "vmovdqa64  %%zmm0,  0x40(%%r9, %%r10) \n"                                                                         \
+    "add $0x80, %%r10 \n"
+
+#define SIZEBTST_256_AVX512                                                                                            \
+    "vmovdqa64  %%zmm0,  0x0(%%r9, %%r10) \n"                                                                          \
+    "vmovdqa64  %%zmm0,  0x40(%%r9, %%r10) \n"                                                                         \
+    "vmovdqa64  %%zmm0,  0x80(%%r9, %%r10) \n"                                                                         \
+    "vmovdqa64  %%zmm0,  0xc0(%%r9, %%r10) \n"                                                                         \
+    "add $0x100, %%r10 \n"
+
+#define SIZEBTST_512_AVX512                                                                                            \
+    "vmovdqa64  %%zmm0,  0x0(%%r9, %%r10) \n"                                                                          \
+    "vmovdqa64  %%zmm0,  0x40(%%r9, %%r10) \n"                                                                         \
+    "vmovdqa64  %%zmm0,  0x80(%%r9, %%r10) \n"                                                                         \
+    "vmovdqa64  %%zmm0,  0xc0(%%r9, %%r10) \n"                                                                         \
+    "vmovdqa64  %%zmm0,  0x100(%%r9, %%r10) \n"                                                                        \
+    "vmovdqa64  %%zmm0,  0x140(%%r9, %%r10) \n"                                                                        \
+    "vmovdqa64  %%zmm0,  0x180(%%r9, %%r10) \n"                                                                        \
+    "vmovdqa64  %%zmm0,  0x1c0(%%r9, %%r10) \n"                                                                        \
+    "add $0x200, %%r10 \n"
+
+#define SIZEBTST_1024_AVX512                                                                                           \
+    "vmovdqa64  %%zmm0,  0x0(%%r9, %%r10) \n"                                                                          \
+    "vmovdqa64  %%zmm0,  0x40(%%r9, %%r10) \n"                                                                         \
+    "vmovdqa64  %%zmm0,  0x80(%%r9, %%r10) \n"                                                                         \
+    "vmovdqa64  %%zmm0,  0xc0(%%r9, %%r10) \n"                                                                         \
+    "vmovdqa64  %%zmm0,  0x100(%%r9, %%r10) \n"                                                                        \
+    "vmovdqa64  %%zmm0,  0x140(%%r9, %%r10) \n"                                                                        \
+    "vmovdqa64  %%zmm0,  0x180(%%r9, %%r10) \n"                                                                        \
+    "vmovdqa64  %%zmm0,  0x1c0(%%r9, %%r10) \n"                                                                        \
+    "vmovdqa64  %%zmm0,  0x200(%%r9, %%r10) \n"                                                                        \
+    "vmovdqa64  %%zmm0,  0x240(%%r9, %%r10) \n"                                                                        \
+    "vmovdqa64  %%zmm0,  0x280(%%r9, %%r10) \n"                                                                        \
+    "vmovdqa64  %%zmm0,  0x2c0(%%r9, %%r10) \n"                                                                        \
+    "vmovdqa64  %%zmm0,  0x300(%%r9, %%r10) \n"                                                                        \
+    "vmovdqa64  %%zmm0,  0x340(%%r9, %%r10) \n"                                                                        \
+    "vmovdqa64  %%zmm0,  0x380(%%r9, %%r10) \n"                                                                        \
+    "vmovdqa64  %%zmm0,  0x3c0(%%r9, %%r10) \n"                                                                        \
+    "add $0x400, %%r10 \n"
+
+#define SIZEBTLD_64_AVX512                                                                                             \
+    "vmovntdqa 0x0(%%r9, %%r10), %%zmm0 \n"                                                                            \
+    "add $0x40, %%r10 \n" /* one 64-byte vmovntdqa load from r9 + r10 into zmm0, then r10 += 0x40; the larger SIZEBTLD_*_AVX512 macros load into successive zmm registers */
+
+#define SIZEBTLD_128_AVX512                                                                                            \
+    "vmovntdqa  0x0(%%r9, %%r10), %%zmm0 \n"                                                                           \
+    "vmovntdqa  0x40(%%r9, %%r10), %%zmm1 \n"                                                                          \
+    "add $0x80, %%r10 \n"
+
+#define SIZEBTLD_256_AVX512                                                                                            \
+    "vmovntdqa  0x0(%%r9, %%r10), %%zmm0 \n"                                                                           \
+    "vmovntdqa  0x40(%%r9, %%r10), %%zmm1 \n"                                                                          \
+    "vmovntdqa  0x80(%%r9, %%r10), %%zmm2 \n"                                                                          \
+    "vmovntdqa  0xc0(%%r9, %%r10), %%zmm3 \n"                                                                          \
+    "add $0x100, %%r10 \n"
+
+#define SIZEBTLD_512_AVX512                                                                                            \
+    "vmovntdqa  0x0(%%r9, %%r10), %%zmm0 \n"                                                                           \
+    "vmovntdqa  0x40(%%r9, %%r10), %%zmm1 \n"                                                                          \
+    "vmovntdqa  0x80(%%r9, %%r10), %%zmm2 \n"                                                                          \
+    "vmovntdqa  0xc0(%%r9, %%r10), %%zmm3 \n"                                                                          \
+    "vmovntdqa  0x100(%%r9, %%r10), %%zmm4 \n"                                                                         \
+    "vmovntdqa  0x140(%%r9, %%r10), %%zmm5 \n"                                                                         \
+    "vmovntdqa  0x180(%%r9, %%r10), %%zmm6 \n"                                                                         \
+    "vmovntdqa  0x1c0(%%r9, %%r10), %%zmm7 \n"                                                                         \
+    "add $0x200, %%r10 \n"
+
+#define SIZEBTLD_1024_AVX512                                                                                           \
+    "vmovntdqa  0x0(%%r9, %%r10), %%zmm0 \n"                                                                           \
+    "vmovntdqa  0x40(%%r9, %%r10), %%zmm1 \n"                                                                          \
+    "vmovntdqa  0x80(%%r9, %%r10), %%zmm2 \n"                                                                          \
+    "vmovntdqa  0xc0(%%r9, %%r10), %%zmm3 \n"                                                                          \
+    "vmovntdqa  0x100(%%r9, %%r10), %%zmm4 \n"                                                                         \
+    "vmovntdqa  0x140(%%r9, %%r10), %%zmm5 \n"                                                                         \
+    "vmovntdqa  0x180(%%r9, %%r10), %%zmm6 \n"                                                                         \
+    "vmovntdqa  0x1c0(%%r9, %%r10), %%zmm7 \n"                                                                         \
+    "vmovntdqa  0x200(%%r9, %%r10), %%zmm8 \n"                                                                         \
+    "vmovntdqa  0x240(%%r9, %%r10), %%zmm9 \n"                                                                         \
+    "vmovntdqa  0x280(%%r9, %%r10), %%zmm10 \n"                                                                        \
+    "vmovntdqa  0x2c0(%%r9, %%r10), %%zmm11 \n"                                                                        \
+    "vmovntdqa  0x300(%%r9, %%r10), %%zmm12 \n"                                                                        \
+    "vmovntdqa  0x340(%%r9, %%r10), %%zmm13 \n"                                                                        \
+    "vmovntdqa  0x380(%%r9, %%r10), %%zmm14 \n"                                                                        \
+    "vmovntdqa  0x3c0(%%r9, %%r10), %%zmm15 \n"                                                                        \
+    "add $0x400, %%r10 \n"
+
+#define SIZEBT_NT_64                                                                                                   \
+    "movnti %[random], 0x0(%%r9, %%r10) \n"                                                                            \
+    "movnti %[random], 0x8(%%r9, %%r10) \n"                                                                            \
+    "movnti %[random], 0x10(%%r9, %%r10) \n"                                                                           \
+    "movnti %[random], 0x18(%%r9, %%r10) \n"                                                                           \
+    "movnti %[random], 0x20(%%r9, %%r10) \n"                                                                           \
+    "movnti %[random], 0x28(%%r9, %%r10) \n"                                                                           \
+    "movnti %[random], 0x30(%%r9, %%r10) \n"                                                                           \
+    "movnti %[random], 0x38(%%r9, %%r10) \n"                                                                           \
+    "add $0x40, %%r10 \n" /* eight movnti stores of the [random] operand at offsets 0x0..0x38 from r9 + r10, then r10 += 0x40 */
+
+#define SIZEBT_LOAD_64                                                                                                 \
+    /* Eight 8-byte loads covering the 64 bytes at r9 + r10. Every load lands in r13 */                                \
+    /* and is discarded, so any asm statement using this macro must list r13 as clobbered. */                          \
+    "mov 0x0(%%r9, %%r10),  %%r13  \n" "mov 0x8(%%r9, %%r10),  %%r13  \n"                                              \
+    "mov 0x10(%%r9, %%r10), %%r13  \n" "mov 0x18(%%r9, %%r10), %%r13  \n"                                              \
+    "mov 0x20(%%r9, %%r10), %%r13  \n" "mov 0x28(%%r9, %%r10), %%r13  \n"                                              \
+    "mov 0x30(%%r9, %%r10), %%r13  \n" "mov 0x38(%%r9, %%r10), %%r13  \n"                                              \
+    /* Advance r10 like every other SIZEBT* macro; without it a cmp/jl loop on r10 never exits. */                     \
+    "add $0x40, %%r10 \n"
+
+/* Arbitrary sizes w/o clearing pipeline */
+
+#define SIZEBTNT_MACRO SIZEBTNT_512_AVX512 /* inner-loop body of sizebw_nt */
+#define SIZEBTST_MACRO SIZEBTST_512_AVX512 /* inner-loop body of sizebw_store */
+#define SIZEBTLD_MACRO SIZEBT_LOAD_64 /* inner-loop body of sizebw_load: the scalar 8-byte loads, not an AVX-512 variant */
+#define SIZEBTSTFLUSH_MACRO SIZEBTSTFLUSH_512_AVX512 /* not referenced in the visible part of this header */
+
+// #define SIZEBTST_FENCE	"mfence \n"
+// #define SIZEBTLD_FENCE	"mfence \n"
+#define SIZEBTST_FENCE "" /* empty: no fence instruction is emitted after each store round */
+#define SIZEBTLD_FENCE "" /* empty: no fence instruction is emitted after each load round */
+
+#define CACHEFENCE_FENCE "sfence \n" /* not referenced in the visible part of this header */
+// #define CACHEFENCE_FENCE	"mfence \n"
+
+#define RandLFSR64_NEW(rand, accessmask, addr) /* one shift/xor LFSR step on the state behind operand `rand`; reads the feedback mask from rcx */ \
+  "mov    (%[" #rand "]), %%r9 \n"          /* r9 = current state, loaded through the rand operand */ \
+  "mov    %%r9, %%r12 \n"                   /* r12 = copy of the state */ \
+  "shr    %%r9 \n"                          /* r9 = state >> 1 */ \
+  "and    $0x1, %%r12d \n"                  /* r12 = low bit of the old state */ \
+  "neg    %%r12 \n"                         /* r12 = 0 or all ones */ \
+  "and    %%rcx, %%r12 \n"                  /* r12 = rcx if the low bit was set, else 0 */ \
+  "xor    %%r9, %%r12 \n"                   /* r12 = next state */ \
+  "mov    %%r12, (%[" #rand "]) \n"         /* store the next state back through rand */ \
+  "mov    %%r12, %%r8 \n"                   /* r8 = next state ... */ \
+  "and    %[" #accessmask "], %%r8 \n"      /* ... masked by the accessmask operand */ \
+  "lea (%[" #addr "], %%r8), %%r9 \n"       /* r9 = addr operand + r8 */
+
+#define RandLFSR64 /* one shift/xor LFSR step on the state behind [random]; reads the feedback mask from rcx, leaves the masked result in r8 */ \
+  "mov    (%[random]), %%r9 \n"             /* r9 = current state, loaded through the [random] operand */ \
+  "mov    %%r9, %%r12 \n"                   /* r12 = copy of the state */ \
+  "shr    %%r9 \n"                          /* r9 = state >> 1 */ \
+  "and    $0x1, %%r12d \n"                  /* r12 = low bit of the old state */ \
+  "neg    %%r12 \n"                         /* r12 = 0 or all ones */ \
+  "and    %%rcx, %%r12 \n"                  /* r12 = rcx if the low bit was set, else 0 */ \
+  "xor    %%r9, %%r12 \n"                   /* r12 = next state */ \
+  "mov    %%r12, (%[random]) \n"            /* store the next state back through [random] */ \
+  "mov    %%r12, %%r8 \n"                   /* r8 = next state ... */ \
+  "and    %[accessmask], %%r8 \n"           /* ... masked by [accessmask] */
+
+
+void sizebw_load(char *start_addr, long size, long count, long *rand_seed, long access_mask) {
+    /* Random-offset load kernel. Each round advances the LFSR state in *rand_seed,
+     * masks it with access_mask to get an offset into start_addr, and repeats
+     * SIZEBTLD_MACRO from that offset while r10 < size (signed compare), so the
+     * selected macro must advance r10. The outer loop is unrolled 4x: r11 grows by
+     * 4 per pass until it reaches count. Registers: rcx = LFSR feedback mask,
+     * r8 = offset, r9 = start_addr + offset, r10 = bytes accessed, r12 = LFSR temp. */
+    KERNEL_BEGIN
+    asm volatile("movabs $0xd800000000000000, %%rcx \n" /* rcx: bitmask used in LFSR */
+                 "xor %%r8, %%r8 \n"                    /* r8: access offset */
+                 "xor %%r11, %%r11 \n"                  /* r11: access counter */
+                 "LOOP_FRNG_SIZEBWL_RLOOP: \n"          /* outer (counter) loop */
+                 /* round 1 */
+                 RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n"
+                 "xor %%r10, %%r10 \n" "LOOP_FRNG_SIZEBWL_ONE1: \n"
+                 SIZEBTLD_MACRO "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_FRNG_SIZEBWL_ONE1 \n" SIZEBTLD_FENCE
+                 /* round 2 */
+                 RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n"
+                 "xor %%r10, %%r10 \n" "LOOP_FRNG_SIZEBWL_ONE2: \n"
+                 SIZEBTLD_MACRO "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_FRNG_SIZEBWL_ONE2 \n" SIZEBTLD_FENCE
+                 /* round 3 */
+                 RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n"
+                 "xor %%r10, %%r10 \n" "LOOP_FRNG_SIZEBWL_ONE3: \n"
+                 SIZEBTLD_MACRO "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_FRNG_SIZEBWL_ONE3 \n" SIZEBTLD_FENCE
+                 /* round 4 */
+                 RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n"
+                 "xor %%r10, %%r10 \n" "LOOP_FRNG_SIZEBWL_ONE4: \n"
+                 SIZEBTLD_MACRO "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_FRNG_SIZEBWL_ONE4 \n" SIZEBTLD_FENCE
+                 "add $4, %%r11 \n"
+                 "cmp %[count], %%r11\n"
+                 "jl LOOP_FRNG_SIZEBWL_RLOOP \n"
+                 : [random] "=r"(rand_seed)
+                 : [start_addr] "r"(start_addr), [accesssize] "r"(size), [count] "r"(count),
+                   "0"(rand_seed), [accessmask] "r"(access_mask)
+                 : "%rcx", "%r13", "%r12", "%r11", "%r10", "%r9", "%r8", "memory", "cc"); /* r13: load destination of SIZEBT_LOAD_64; memory: *rand_seed is rewritten */
+    KERNEL_END
+}
+
+void sizebw_load_new(char *start_addr, long count, long *rand_seed, uint64_t access_mask) {
+    /* Issues one 8-byte load per iteration at an LFSR-generated offset: each pass advances
+     * the LFSR state in *rand_seed, masks it with access_mask, and loads from start_addr +
+     * offset into r13 (discarded). The loop test is at the bottom, so the body runs at least once. */
+    KERNEL_BEGIN
+    asm volatile("movabs $0xd800000000000000, %%rcx \n" /* rcx: bitmask used in LFSR */
+                 "xor %%r8, %%r8 \n"                    /* r8: access offset */
+                 "xor %%r11, %%r11 \n"                  /* r11: access counter */
+                 "LD_LOOP_NEW: \n"
+                 RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" /* LFSR uses r9, r12, rcx; fills r8 */
+                 "mov 0x0(%%r9), %%r13 \n"
+                 "add $1, %%r11 \n"
+                 "cmp %[count], %%r11\n"
+                 "jl LD_LOOP_NEW \n"
+                 : [random] "=r"(rand_seed)
+                 : [start_addr] "r"(start_addr), [count] "r"(count), "0"(rand_seed), [accessmask] "r"(access_mask)
+                 : "%rcx", "%r13", "%r12", "%r11", "%r10", "%r9", "%r8", "memory", "cc"); /* memory: *rand_seed is rewritten */
+    KERNEL_END
+}
+
+
+#define OPERATION /* expands to nothing here; RANDOM_OPER pastes it after every LFSR step */
+
+
+#define RANDOM_OPER(rand, mask, buf)		\
+  RandLFSR64_NEW(rand, mask, buf)		\
+    OPERATION /* one LFSR step (leaves buf + masked value in r9) followed by OPERATION */
+
+
+#define UNROLL_4(rand, mask, buf)			\
+  RANDOM_OPER(rand, mask, buf)				\
+    RANDOM_OPER(rand, mask, buf)			\
+    RANDOM_OPER(rand, mask, buf)			\
+    RANDOM_OPER(rand, mask, buf) /* four back-to-back RANDOM_OPER expansions */
+
+#define UNROLL_16(rand, mask, buf)		\
+  UNROLL_4(rand, mask, buf)			\
+    UNROLL_4(rand, mask, buf)			\
+    UNROLL_4(rand, mask, buf)			\
+    UNROLL_4(rand, mask, buf) /* sixteen RANDOM_OPER expansions in total */
+
+
+#define LOAD_NEW(start_addr, rand_seed, access_mask)                    \
+  do {                                                                  \
+    /* Advances the LFSR state in *rand_seed 16 times (UNROLL_16), pasting OPERATION after each step. */ \
+    /* r8: rand number, r9: computed addr, r13: dest, r12: temp in lfsr, rcx: bitmask for lfsr */ \
+    asm volatile("movabs $0xd800000000000000, %%rcx \n" /* bitmask for LFSR */ \
+                 "xor %%r8, %%r8 \n" /* r8: access offset */            \
+                 UNROLL_16(random, accessmask, buf)                     \
+                 : [random] "=r"(rand_seed)                             \
+                 : [buf] "r"(start_addr), "0"(rand_seed), [accessmask] "r"(access_mask) \
+                 : "%rcx", "%r13", "%r12", "%r9", "%r8", "memory", "cc"); /* memory: *rand_seed is rewritten */ \
+  } while(0);
+    
+
+void sizebw_nt(char *start_addr, long size, long count, long *rand_seed, long access_mask) {
+    /* Random-offset non-temporal store kernel. Each round advances the LFSR state in
+     * *rand_seed, masks it with access_mask to get an offset into start_addr, and repeats
+     * SIZEBTNT_MACRO from that offset while r10 < size (signed compare), followed by
+     * SIZEBTST_FENCE. The outer loop is unrolled 4x: r11 grows by 4 per pass until it
+     * reaches count. The data stored is zmm0, whose low quadword is set to start_addr
+     * by the movq below. rcx = LFSR feedback mask, r8 = offset, r9 = start_addr + offset. */
+    KERNEL_BEGIN
+    asm volatile("movabs $0xd800000000000000, %%rcx \n" /* rcx: bitmask used in LFSR */
+                 "xor %%r11, %%r11 \n"                  /* r11: access counter */
+                 "movq %[start_addr], %%xmm0 \n"        /* zmm0: read/write register */
+                 "LOOP_FRNG_SIZEBWNT_RLOOP: \n"         /* outer (counter) loop */
+                 RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" /* round 1 */
+                 "xor %%r10, %%r10 \n" "LOOP_FRNG_SIZEBWNT_ONE1: \n"
+                 SIZEBTNT_MACRO "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_FRNG_SIZEBWNT_ONE1 \n" SIZEBTST_FENCE
+                 RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" /* round 2 */
+                 "xor %%r10, %%r10 \n" "LOOP_FRNG_SIZEBWNT_ONE2: \n"
+                 SIZEBTNT_MACRO "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_FRNG_SIZEBWNT_ONE2 \n" SIZEBTST_FENCE
+                 RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" /* round 3 */
+                 "xor %%r10, %%r10 \n" "LOOP_FRNG_SIZEBWNT_ONE3: \n"
+                 SIZEBTNT_MACRO "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_FRNG_SIZEBWNT_ONE3 \n" SIZEBTST_FENCE
+                 RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" /* round 4 */
+                 "xor %%r10, %%r10 \n" "LOOP_FRNG_SIZEBWNT_ONE4: \n"
+                 SIZEBTNT_MACRO "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_FRNG_SIZEBWNT_ONE4 \n" SIZEBTST_FENCE
+                 "add $4, %%r11 \n"
+                 "cmp %[count], %%r11\n"
+                 "jl LOOP_FRNG_SIZEBWNT_RLOOP \n"
+                 : [random] "=r"(rand_seed)
+                 : [start_addr] "r"(start_addr), [accesssize] "r"(size), [count] "r"(count),
+                   "0"(rand_seed), [accessmask] "r"(access_mask)
+                 : "%rcx", "%r12", "%r11", "%r10", "%r9", "%r8", "%xmm0", "memory", "cc"); /* xmm0 is written by movq; memory: buffer and *rand_seed are written */
+    KERNEL_END
+}
+
+/*
+ * sizebw_store - random-offset access kernel built around SIZEBTST_MACRO.
+ *
+ * Same skeleton as the other sizebw_* kernels: each outer pass runs four
+ * unrolled rounds; a round expands RandLFSR64, sets r9 = start_addr + r8,
+ * repeats SIZEBTST_MACRO until r10 reaches `size`, then issues
+ * SIZEBTST_FENCE. r11 counts rounds in steps of 4 up to `count`.
+ *
+ * start_addr:  base address of the buffer (also copied into xmm0)
+ * size:        bound r10 is compared against inside each round
+ * count:       number of rounds, consumed four at a time
+ * rand_seed:   tied to the [random] operand
+ * access_mask: exposed to the template as [accessmask]
+ *
+ * NOTE(review): RandLFSR64, SIZEBTST_MACRO and SIZEBTST_FENCE are defined
+ * elsewhere; their register usage (presumably r8, r12, rcx, r10) should be
+ * confirmed against the clobber list.
+ */
+void sizebw_store(char *start_addr, long size, long count, long *rand_seed, long access_mask) {
+    KERNEL_BEGIN
+    asm volatile("movabs $0xd800000000000000, %%rcx \n"
+                 "xor %%r11, %%r11 \n" /* r11: round counter */
+                 "movq %[start_addr], %%xmm0 \n" /* xmm0 = low lane of zmm0: read/write register */
+                 // 1
+                 "LOOP_FRNG_SIZEBWST_RLOOP: \n" RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n"
+                 "xor %%r10, %%r10 \n"
+                 "LOOP_FRNG_SIZEBWST_ONE1: \n" SIZEBTST_MACRO "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_FRNG_SIZEBWST_ONE1 \n" SIZEBTST_FENCE
+
+                     // 2
+                     RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n"
+                 "xor %%r10, %%r10 \n"
+                 "LOOP_FRNG_SIZEBWST_ONE2: \n" SIZEBTST_MACRO "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_FRNG_SIZEBWST_ONE2 \n" SIZEBTST_FENCE
+                     // 3
+                     RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n"
+                 "xor %%r10, %%r10 \n"
+                 "LOOP_FRNG_SIZEBWST_ONE3: \n" SIZEBTST_MACRO "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_FRNG_SIZEBWST_ONE3 \n" SIZEBTST_FENCE
+                     // 4
+                     RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n"
+                 "xor %%r10, %%r10 \n"
+                 "LOOP_FRNG_SIZEBWST_ONE4: \n" SIZEBTST_MACRO "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_FRNG_SIZEBWST_ONE4 \n" SIZEBTST_FENCE
+
+                 "add $4, %%r11 \n"
+                 "cmp %[count], %%r11\n"
+                 "jl LOOP_FRNG_SIZEBWST_RLOOP \n"
+
+                 : [random] "=r"(rand_seed)
+                 : [start_addr] "r"(start_addr), [accesssize] "r"(size), [count] "r"(count),
+                   "0"(rand_seed), [accessmask] "r"(access_mask)
+                 /* xmm0 is overwritten above and the buffer is accessed without being
+                  * listed as an operand: tell the compiler about both. */
+                 : "%rcx", "%r12", "%r11", "%r10", "%r9", "%r8", "xmm0", "memory", "cc");
+    KERNEL_END
+}
+
+/*
+ * sizebw_storeclwb - random-offset access kernel built around
+ * SIZEBTSTFLUSH_MACRO.
+ *
+ * Each outer pass runs four unrolled rounds; a round expands RandLFSR64,
+ * sets r9 = start_addr + r8, repeats SIZEBTSTFLUSH_MACRO until r10 reaches
+ * `size`, then issues SIZEBTST_FENCE. r11 counts rounds in steps of 4 up to
+ * `count`.
+ *
+ * start_addr:  base address of the buffer (also copied into xmm0)
+ * size:        bound r10 is compared against inside each round
+ * count:       number of rounds, consumed four at a time
+ * rand_seed:   tied to the [random] operand
+ * access_mask: exposed to the template as [accessmask]
+ *
+ * NOTE(review): the three macros are defined elsewhere; judging by its name
+ * SIZEBTSTFLUSH_MACRO presumably stores and then flushes each line -
+ * confirm against its definition.
+ */
+void sizebw_storeclwb(char *start_addr, long size, long count, long *rand_seed, long access_mask) {
+    KERNEL_BEGIN
+    asm volatile("movabs $0xd800000000000000, %%rcx \n"
+                 "xor %%r11, %%r11 \n" /* r11: round counter */
+                 "movq %[start_addr], %%xmm0 \n" /* xmm0 = low lane of zmm0: read/write register */
+                 // 1
+                 "LOOP_FRNG_SIZEBWSTFLUSH_RLOOP: \n" RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n"
+                 "xor %%r10, %%r10 \n"
+                 "LOOP_FRNG_SIZEBWSTFLUSH_ONE1: \n" SIZEBTSTFLUSH_MACRO "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_FRNG_SIZEBWSTFLUSH_ONE1 \n" SIZEBTST_FENCE
+
+                     // 2
+                     RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n"
+                 "xor %%r10, %%r10 \n"
+                 "LOOP_FRNG_SIZEBWSTFLUSH_ONE2: \n" SIZEBTSTFLUSH_MACRO "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_FRNG_SIZEBWSTFLUSH_ONE2 \n" SIZEBTST_FENCE
+                     // 3
+                     RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n"
+                 "xor %%r10, %%r10 \n"
+                 "LOOP_FRNG_SIZEBWSTFLUSH_ONE3: \n" SIZEBTSTFLUSH_MACRO "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_FRNG_SIZEBWSTFLUSH_ONE3 \n" SIZEBTST_FENCE
+                     // 4
+                     RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n"
+                 "xor %%r10, %%r10 \n"
+                 "LOOP_FRNG_SIZEBWSTFLUSH_ONE4: \n" SIZEBTSTFLUSH_MACRO "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_FRNG_SIZEBWSTFLUSH_ONE4 \n" SIZEBTST_FENCE
+
+                 "add $4, %%r11 \n"
+                 "cmp %[count], %%r11\n"
+                 "jl LOOP_FRNG_SIZEBWSTFLUSH_RLOOP \n"
+
+                 : [random] "=r"(rand_seed)
+                 : [start_addr] "r"(start_addr), [accesssize] "r"(size), [count] "r"(count),
+                   "0"(rand_seed), [accessmask] "r"(access_mask)
+                 /* xmm0 is overwritten above and the buffer is accessed without being
+                  * listed as an operand: tell the compiler about both. */
+                 : "%rcx", "%r12", "%r11", "%r10", "%r9", "%r8", "xmm0", "memory", "cc");
+    KERNEL_END
+}
+
+/*
+ * stride_load - strided access kernel built around SIZEBTLD_64_AVX512.
+ *
+ * For each of `count` outer iterations: point r9 at start_addr + r8, repeat
+ * the access macro until r10 reaches `size`, issue SIZEBTLD_FENCE, spin in
+ * a counting loop until r10 reaches `delay`, then advance the offset r8 by
+ * `skip`.
+ *
+ * start_addr: base address of the buffer
+ * size:       bound on r10 for the access loop
+ * skip:       added to the offset after every outer iteration
+ * delay:      iteration bound of the busy-wait loop (it always runs at
+ *             least once because the counter is incremented before the test)
+ * count:      number of outer iterations
+ *
+ * NOTE(review): SIZEBTLD_64_AVX512 and SIZEBTLD_FENCE are defined elsewhere.
+ * Any vector registers the access macro writes are not declared here -
+ * confirm and extend the clobber list if needed.
+ */
+void stride_load(char *start_addr, long size, long skip, long delay, long count) {
+    KERNEL_BEGIN
+    asm volatile("xor %%r8, %%r8 \n" /* r8: access offset */
+                 "xor %%r11, %%r11 \n" /* r11: counter */
+
+                 // 1
+                 "LOOP_STRIDELOAD_OUTER: \n" /* outer (counter) loop */
+                 "lea (%[start_addr], %%r8), %%r9 \n" /* r9: access loc */
+                 "xor %%r10, %%r10 \n" /* r10: accessed size */
+                 "LOOP_STRIDELOAD_INNER: \n" /* inner (access) loop, unroll 8 times */
+                 SIZEBTLD_64_AVX512 /* Access: uses r10[size_accessed], r9 */
+                 "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_STRIDELOAD_INNER \n" SIZEBTLD_FENCE
+
+                 "xor %%r10, %%r10 \n"
+                 "LOOP_STRIDELOAD_DELAY: \n" /* delay <delay> cycles */
+                 "inc %%r10 \n"
+                 "cmp %[delay], %%r10 \n"
+                 "jl LOOP_STRIDELOAD_DELAY \n"
+
+                 "add %[skip], %%r8 \n"
+                 "inc %%r11 \n"
+                 "cmp %[count], %%r11 \n"
+
+                 "jl LOOP_STRIDELOAD_OUTER \n"
+
+                 ::[start_addr] "r"(start_addr),
+                 [accesssize] "r"(size), [count] "r"(count), [skip] "r"(skip), [delay] "r"(delay)
+                 /* The buffer is accessed without appearing as an operand, so a
+                  * "memory" clobber keeps the compiler from caching or reordering
+                  * accesses to it across this statement. */
+                 : "%r11", "%r10", "%r9", "%r8", "memory", "cc");
+    KERNEL_END
+}
+
+/*
+ * stride_nt - strided access kernel built around SIZEBTNT_64_AVX512.
+ *
+ * For each of `count` outer iterations: point r9 at start_addr + r8, repeat
+ * the access macro until r10 reaches `size`, issue SIZEBTLD_FENCE, spin in
+ * a counting loop until r10 reaches `delay`, then advance the offset r8 by
+ * `skip`.
+ *
+ * start_addr: base address of the buffer (also copied into xmm0)
+ * size:       bound on r10 for the access loop
+ * skip:       added to the offset after every outer iteration
+ * delay:      iteration bound of the busy-wait loop (runs at least once)
+ * count:      number of outer iterations
+ *
+ * NOTE(review): this kernel ends its access loop with SIZEBTLD_FENCE while
+ * the other store-style kernels use SIZEBTST_FENCE - confirm that the load
+ * fence is intended here. Both macros are defined elsewhere.
+ */
+void stride_nt(char *start_addr, long size, long skip, long delay, long count) {
+    KERNEL_BEGIN
+    asm volatile("xor %%r8, %%r8 \n" /* r8: access offset */
+                 "xor %%r11, %%r11 \n" /* r11: counter */
+                 "movq %[start_addr], %%xmm0 \n" /* xmm0 = low lane of zmm0: read/write register */
+                 // 1
+                 "LOOP_STRIDENT_OUTER: \n" /* outer (counter) loop */
+                 "lea (%[start_addr], %%r8), %%r9 \n" /* r9: access loc */
+                 "xor %%r10, %%r10 \n" /* r10: accessed size */
+                 "LOOP_STRIDENT_INNER: \n" /* inner (access) loop, unroll 8 times */
+                 SIZEBTNT_64_AVX512 /* Access: uses r10[size_accessed], r9 */
+                 "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_STRIDENT_INNER \n" SIZEBTLD_FENCE
+
+                 "xor %%r10, %%r10 \n"
+                 "LOOP_STRIDENT_DELAY: \n" /* delay <delay> cycles */
+                 "inc %%r10 \n"
+                 "cmp %[delay], %%r10 \n"
+                 "jl LOOP_STRIDENT_DELAY \n"
+
+                 "add %[skip], %%r8 \n"
+                 "inc %%r11 \n"
+                 "cmp %[count], %%r11 \n"
+
+                 "jl LOOP_STRIDENT_OUTER \n"
+
+                 ::[start_addr] "r"(start_addr),
+                 [accesssize] "r"(size), [count] "r"(count), [skip] "r"(skip), [delay] "r"(delay)
+                 /* xmm0 is overwritten above and the buffer is accessed without being
+                  * listed as an operand: declare both to the compiler. */
+                 : "%r11", "%r10", "%r9", "%r8", "xmm0", "memory", "cc");
+    KERNEL_END
+}
+
+/*
+ * stride_store - strided access kernel built around SIZEBTST_64_AVX512.
+ *
+ * For each of `count` outer iterations: point r9 at start_addr + r8, repeat
+ * the access macro until r10 reaches `size`, issue SIZEBTST_FENCE, spin in
+ * a counting loop until r10 reaches `delay`, then advance the offset r8 by
+ * `skip`.
+ *
+ * start_addr: base address of the buffer (also copied into xmm0)
+ * size:       bound on r10 for the access loop
+ * skip:       added to the offset after every outer iteration
+ * delay:      iteration bound of the busy-wait loop (runs at least once)
+ * count:      number of outer iterations
+ *
+ * NOTE(review): SIZEBTST_64_AVX512 and SIZEBTST_FENCE are defined elsewhere.
+ */
+void stride_store(char *start_addr, long size, long skip, long delay, long count) {
+    KERNEL_BEGIN
+    asm volatile("xor %%r8, %%r8 \n" /* r8: access offset */
+                 "xor %%r11, %%r11 \n" /* r11: counter */
+                 "movq %[start_addr], %%xmm0 \n" /* xmm0 = low lane of zmm0: read/write register */
+                 // 1
+                 "LOOP_STRIDEST_OUTER: \n" /* outer (counter) loop */
+                 "lea (%[start_addr], %%r8), %%r9 \n" /* r9: access loc */
+                 "xor %%r10, %%r10 \n" /* r10: accessed size */
+                 "LOOP_STRIDEST_INNER: \n" /* inner (access) loop, unroll 8 times */
+                 SIZEBTST_64_AVX512 /* Access: uses r10[size_accessed], r9 */
+                 "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_STRIDEST_INNER \n" SIZEBTST_FENCE
+
+                 "xor %%r10, %%r10 \n"
+                 "LOOP_STRIDEST_DELAY: \n" /* delay <delay> cycles */
+                 "inc %%r10 \n"
+                 "cmp %[delay], %%r10 \n"
+                 "jl LOOP_STRIDEST_DELAY \n"
+
+                 "add %[skip], %%r8 \n"
+                 "inc %%r11 \n"
+                 "cmp %[count], %%r11 \n"
+
+                 "jl LOOP_STRIDEST_OUTER \n"
+
+                 ::[start_addr] "r"(start_addr),
+                 [accesssize] "r"(size), [count] "r"(count), [skip] "r"(skip), [delay] "r"(delay)
+                 /* xmm0 is overwritten above and the buffer is accessed without being
+                  * listed as an operand: declare both to the compiler. */
+                 : "%r11", "%r10", "%r9", "%r8", "xmm0", "memory", "cc");
+    KERNEL_END
+}
+
+/*
+ * stride_storeclwb - strided access kernel built around
+ * SIZEBTSTFLUSH_64_AVX512.
+ *
+ * For each of `count` outer iterations: point r9 at start_addr + r8, repeat
+ * the access macro until r10 reaches `size`, issue SIZEBTST_FENCE, spin in
+ * a counting loop until r10 reaches `delay`, then advance the offset r8 by
+ * `skip`.
+ *
+ * start_addr: base address of the buffer (also copied into xmm0)
+ * size:       bound on r10 for the access loop
+ * skip:       added to the offset after every outer iteration
+ * delay:      iteration bound of the busy-wait loop (runs at least once)
+ * count:      number of outer iterations
+ *
+ * NOTE(review): SIZEBTSTFLUSH_64_AVX512 and SIZEBTST_FENCE are defined
+ * elsewhere; presumably the former stores and writes back each line.
+ */
+void stride_storeclwb(char *start_addr, long size, long skip, long delay, long count) {
+    KERNEL_BEGIN
+    asm volatile("xor %%r8, %%r8 \n" /* r8: access offset */
+                 "xor %%r11, %%r11 \n" /* r11: counter */
+                 "movq %[start_addr], %%xmm0 \n" /* xmm0 = low lane of zmm0: read/write register */
+                 // 1
+                 "LOOP_STRIDESTFLUSH_OUTER: \n" /* outer (counter) loop */
+                 "lea (%[start_addr], %%r8), %%r9 \n" /* r9: access loc */
+                 "xor %%r10, %%r10 \n" /* r10: accessed size */
+                 "LOOP_STRIDESTFLUSH_INNER: \n" /* inner (access) loop, unroll 8 times */
+                 SIZEBTSTFLUSH_64_AVX512 /* Access: uses r10[size_accessed], r9 */
+                 "cmp %[accesssize], %%r10 \n"
+                 "jl LOOP_STRIDESTFLUSH_INNER \n" SIZEBTST_FENCE
+
+                 "xor %%r10, %%r10 \n"
+                 "LOOP_STRIDESTFLUSH_DELAY: \n" /* delay <delay> cycles */
+                 "inc %%r10 \n"
+                 "cmp %[delay], %%r10 \n"
+                 "jl LOOP_STRIDESTFLUSH_DELAY \n"
+
+                 "add %[skip], %%r8 \n"
+                 "inc %%r11 \n"
+                 "cmp %[count], %%r11 \n"
+
+                 "jl LOOP_STRIDESTFLUSH_OUTER \n"
+
+                 ::[start_addr] "r"(start_addr),
+                 [accesssize] "r"(size), [count] "r"(count), [skip] "r"(skip), [delay] "r"(delay)
+                 /* xmm0 is overwritten above and the buffer is accessed without being
+                  * listed as an operand: declare both to the compiler. */
+                 : "%r11", "%r10", "%r9", "%r8", "xmm0", "memory", "cc");
+    KERNEL_END
+}
+
+/* Upper bound on rdrand attempts made by one get_rand() call before it reports failure. */
+#define RDRAND_MAX_RETRY 32
+
+/*
+ * get_rand() (defined further below): generate a random number into [rd]
+ * within [range]; returns 0 on success, 1 on failure.
+ */
+/*
+ * stride_read_after_write - strided kernel that runs two access passes over
+ * the same region back to back.
+ *
+ * For each of `count` outer iterations: point r9 at start_addr + r8, run
+ * SIZEBTSTFLUSH_64_AVX512 until r10 reaches `size`, mfence, reset r10 and
+ * run SIZEBTNT_64_AVX512 over the same range, mfence, spin until r10
+ * reaches `delay`, then advance the offset r8 by `skip`.
+ *
+ * start_addr: base address of the buffer (also copied into xmm0)
+ * size:       bound on r10 for both access loops
+ * skip:       added to the offset after every outer iteration
+ * delay:      iteration bound of the busy-wait loop (runs at least once)
+ * count:      number of outer iterations
+ *
+ * NOTE(review): both access macros are defined elsewhere. The label of the
+ * second loop says "LDNT" while the macro is SIZEBTNT_64_AVX512 - confirm
+ * that it performs the read half of the read-after-write pattern.
+ */
+void stride_read_after_write(char *start_addr, long size, long skip, long delay, long count) {
+    KERNEL_BEGIN
+    asm volatile("xor	%%r8, %%r8 \n" /* r8: access offset */
+                 "xor	%%r11, %%r11 \n" /* r11: counter */
+                 "movq	%[start_addr], %%xmm0 \n" /* xmm0 = low lane of zmm0: read/write register */
+
+                 "LOOP_RAW_OUTER: \n" /* outer (counter) loop */
+                 "lea	(%[start_addr], %%r8), %%r9 \n" /* r9: access loc */
+                 "xor	%%r10, %%r10 \n" /* r10: accessed size */
+                 "LOOP_RAW_STRIDESTCLWB_INNER: \n" /* inner (access) loop, unroll 8 times */
+                 SIZEBTSTFLUSH_64_AVX512 /* Access: uses r10[size_accessed], r9 */
+                 "cmp	%[accesssize], %%r10 \n"
+                 "jl		LOOP_RAW_STRIDESTCLWB_INNER \n"
+                 "mfence \n"
+
+                 "xor	%%r10, %%r10 \n" /* restart the size counter for the second pass */
+                 "LOOP_RAW_STRIDELDNT_INNER: \n" SIZEBTNT_64_AVX512 "cmp	%[accesssize], %%r10 \n"
+                 "jl		LOOP_RAW_STRIDELDNT_INNER \n"
+                 "mfence \n"
+
+                 "xor	%%r10, %%r10 \n"
+                 "LOOP_RAW_DELAY: \n" /* delay <delay> cycles */
+                 "inc	%%r10 \n"
+                 "cmp	%[delay], %%r10 \n"
+                 "jl		LOOP_RAW_DELAY \n"
+
+                 "add	%[skip], %%r8 \n"
+                 "inc	%%r11 \n"
+                 "cmp	%[count], %%r11 \n"
+
+                 "jl		LOOP_RAW_OUTER \n"
+
+                 ::[start_addr] "r"(start_addr),
+                 [accesssize] "r"(size), [count] "r"(count), [skip] "r"(skip), [delay] "r"(delay)
+                 /* xmm0 is overwritten above and the buffer is accessed without being
+                  * listed as an operand: declare both to the compiler. */
+                 : "%r11", "%r10", "%r9", "%r8", "xmm0", "memory", "cc");
+    KERNEL_END
+}
+
+/*
+ * get_rand - draw a hardware random number and reduce it modulo `range`.
+ *
+ * rd:    receives the result, a value in [0, range)
+ * range: exclusive upper bound; must be non-zero
+ *
+ * rdrand reports success through the carry flag, which setc copies into
+ * `ok`. The instruction is retried up to RDRAND_MAX_RETRY times.
+ *
+ * Returns 0 on success, 1 if range is 0 or every attempt failed.
+ */
+static inline int get_rand(uint64_t *rd, uint64_t range) {
+    uint8_t ok;
+    int i;
+
+    /* The modulo below would divide by zero for an empty range. */
+    if (range == 0)
+        return 1;
+
+    for (i = 0; i < RDRAND_MAX_RETRY; i++) {
+        asm volatile("rdrand %0; setc %1\n\t" : "=r"(*rd), "=qm"(ok) : : "cc");
+
+        if (ok) {
+            *rd = *rd % range;
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+/*
+ * init_chasing_index - fill cindex[0..csize) with a random pointer-chasing
+ * chain that starts at slot 0.
+ *
+ * The table is zeroed first, so a zero entry means "not linked yet". Each
+ * step draws random slots until it finds one that is still unlinked and is
+ * not the current slot, stores it as the successor of the current slot and
+ * moves there. After csize - 1 links the last slot still holds 0, i.e. it
+ * points back to slot 0 and closes the cycle.
+ *
+ * cindex: table of csize entries to initialise
+ * csize:  number of entries
+ *
+ * Returns 0 on success, 1 if cindex is NULL, csize is 0, or get_rand()
+ * fails.
+ */
+int init_chasing_index(uint64_t *cindex, uint64_t csize) {
+    uint64_t curr_pos = 0;
+    uint64_t next_pos = 0;
+    uint64_t i = 0;
+
+    /* csize - 1 would wrap around for an empty table and the loop would run
+     * (and write) far out of bounds. */
+    if (cindex == NULL || csize == 0)
+        return 1;
+
+    memset(cindex, 0, sizeof(uint64_t) * csize);
+
+    for (i = 0; i < csize - 1; i++) {
+        /* Rejection-sample an unlinked slot different from the current one. */
+        do {
+            if (get_rand(&next_pos, csize) != 0)
+                return 1;
+        } while ((cindex[next_pos] != 0) || (next_pos == curr_pos));
+
+        cindex[curr_pos] = next_pos;
+        curr_pos = next_pos;
+    }
+
+    return 0;
+}
+
+/*
+ * chasing_storeclwb - write a pointer-chasing chain into the buffer.
+ *
+ * For each of `count` regions (region base r9 = start_addr + r8, r8 growing
+ * by `skip`), walk the chain in `cindex` starting at element 0. Per step:
+ * load cindex[r12] into xmm0, turn r12 into a byte offset (r12 * 64), store
+ * zmm0 to r9 + offset with vmovdqa64, clwb that line, and continue with the
+ * index just loaded. The walk stops once r10 (64 bytes per step) reaches
+ * `size`; SIZEBTST_FENCE follows every region.
+ *
+ * start_addr: base address of the buffer
+ * size:       bytes written per region
+ * skip:       distance between consecutive region bases
+ * count:      number of regions
+ * cindex:     chain table; entry i holds the line index visited after i
+ *
+ * NOTE(review): vmovdqa64 with a zmm operand needs a 64-byte aligned target,
+ * so start_addr and skip are presumably multiples of 64 - confirm at the
+ * call sites.
+ */
+void chasing_storeclwb(char *start_addr, long size, long skip, long count, uint64_t *cindex) {
+    KERNEL_BEGIN
+    asm volatile("xor	%%r8, %%r8 \n" /* r8: access offset */
+                 "xor	%%r11, %%r11 \n" /* r11: counter */
+                 "LOOP_CHASING_STCLWB_OUTER: \n" /* outer (counter) loop */
+                 "lea	(%[start_addr], %%r8), %%r9 \n" /* r9: access loc */
+                 "xor	%%r10, %%r10 \n" /* r10: accessed size */
+                 "xor	%%r12, %%r12 \n" /* r12: chasing index addr */
+                 "LOOP_CHASING_STCLWB_INNER: \n"
+                 "movq	(%[cindex], %%r12, 8), %%xmm0\n" /* xmm0: index of the next element */
+                 "shl    $0x06, %%r12\n" /* line index -> byte offset */
+                 "vmovdqa64	%%zmm0,  0x0(%%r9, %%r12) \n"
+                 "clwb	0x0(%%r9, %%r12) \n"
+                 "add	$0x40, %%r10\n"
+                 "movq	%%xmm0, %%r12\n" /* Update to next chasing element */
+
+                 "cmp	%[accesssize], %%r10 \n"
+                 "jl		LOOP_CHASING_STCLWB_INNER \n" SIZEBTST_FENCE
+
+                 "xor	%%r10, %%r10 \n"
+
+                 "add	%[skip], %%r8 \n"
+                 "inc	%%r11 \n"
+                 "cmp	%[count], %%r11 \n"
+
+                 "jl		LOOP_CHASING_STCLWB_OUTER \n"
+
+                 :
+                 : [start_addr] "r"(start_addr), [accesssize] "r"(size), [count] "r"(count), [skip] "r"(skip),
+                   [cindex] "r"(cindex)
+                 /* zmm0 is overwritten ("xmm0" names the same hard register) and
+                  * both cindex and the buffer are accessed without being operands. */
+                 : "%r12", "%r11", "%r10", "%r9", "%r8", "xmm0", "memory", "cc");
+    KERNEL_END
+}
+
+/*
+ * chasing_loadnt - follow a pointer-chasing chain stored in the buffer.
+ *
+ * For each of `count` regions (region base r9 = start_addr + r8, r8 growing
+ * by `skip`), start at line 0 and repeatedly: turn the line index in r12
+ * into a byte offset (r12 * 64), load that 64-byte line into zmm0 with
+ * vmovntdqa, and take the low 64 bits as the next line index. The walk
+ * stops once r10 (64 bytes per step) reaches `size`; SIZEBTLD_FENCE follows
+ * every region.
+ *
+ * start_addr: base address of the buffer holding the chain
+ * size:       bytes read per region
+ * skip:       distance between consecutive region bases
+ * count:      number of regions
+ * cindex:     passed as an operand but not referenced by the template; the
+ *             chain is read from the buffer itself
+ */
+void chasing_loadnt(char *start_addr, long size, long skip, long count, uint64_t *cindex) {
+    KERNEL_BEGIN
+    asm volatile("xor    %%r8, %%r8 \n" /* r8: access offset */
+                 "xor    %%r11, %%r11 \n" /* r11: counter */
+                 "LOOP_CHASING_STRIDENT_OUTER: \n" /* outer (counter) loop */
+                 "lea    (%[start_addr], %%r8), %%r9 \n" /* r9: access loc */
+                 "xor    %%r10, %%r10 \n" /* r10: accessed size */
+                 "xor	%%r12, %%r12 \n" /* r12: chasing index addr */
+                 "LOOP_CHASING_STRIDENT_INNER: \n"
+                 "shl    $0x06, %%r12\n" /* line index -> byte offset */
+                 "vmovntdqa 0x0(%%r9, %%r12), %%zmm0\n"
+                 "movq   %%xmm0, %%r12\n" /* Update to next chasing element */
+                 "add    $0x40, %%r10 \n"
+
+                 "cmp    %[accesssize], %%r10 \n"
+                 "jl     LOOP_CHASING_STRIDENT_INNER \n" SIZEBTLD_FENCE
+
+                 //"mfence \n"  /* !!!! */
+                 "add    %[skip], %%r8 \n"
+                 "inc    %%r11 \n"
+                 "cmp    %[count], %%r11 \n"
+
+                 "jl     LOOP_CHASING_STRIDENT_OUTER \n"
+
+                 :
+                 : [start_addr] "r"(start_addr), [accesssize] "r"(size), [count] "r"(count), [skip] "r"(skip),
+                   [cindex] "r"(cindex)
+                 /* r12 is modified by the template and was missing from this list,
+                  * which let the compiler keep a live value (or an operand) in it.
+                  * zmm0 and the memory reads are declared for the same reason. */
+                 : "%r12", "%r11", "%r10", "%r9", "%r8", "xmm0", "memory", "cc");
+    KERNEL_END
+}
+
+/*
+ * cachefence - sequential 64-byte stores with configurable flush and fence
+ * granularity.
+ *
+ * The write offset r9 runs from 0 until it reaches `size`. Lines are
+ * written with vmovdqa64 in batches of `cache` bytes; after each batch the
+ * lines just written (rdx up to rcx) are flushed with clwb. Once `fence`
+ * bytes have been written since the last fence, CACHEFENCE_FENCE is issued
+ * and the fence counter r11 restarts.
+ *
+ * start_addr: base address of the buffer (also copied into xmm0)
+ * size:       total number of bytes to write
+ * cache:      bytes written between clwb batches
+ * fence:      bytes written between fences
+ *
+ * NOTE(review): CACHEFENCE_FENCE is defined elsewhere.
+ */
+void cachefence(char *start_addr, long size, long cache, long fence) {
+    KERNEL_BEGIN
+    asm volatile("movq %[start_addr], %%xmm0 \n"
+                 "xor %%r9, %%r9 \n" /* r9: offset of write */
+                 "CACHEFENCE_FENCEBEGIN: \n"
+                 "xor %%r11, %%r11 \n" /* r11: fence counter */
+                 "CACHEFENCE_FLUSHBEGIN: \n"
+                 "xor %%r10, %%r10 \n" /* r10: clwb counter */
+                 //		"movq %%r9, %%rdx \n"				/* rdx: flush start offset */
+                 "leaq (%[start_addr], %%r9), %%rdx \n" /* rdx: address of the first line of this batch */
+                 "CACHEFENCE_WRITEONE: \n"
+                 "vmovdqa64  %%zmm0, 0x0(%[start_addr], %%r9) \n" /* Write one addr */
+                 "add $0x40, %%r9 \n"
+                 "add $0x40, %%r10 \n"
+                 "add $0x40, %%r11 \n"
+                 "cmp %[cache], %%r10 \n" /* check clwb */
+                 "jl CACHEFENCE_WRITEONE \n"
+
+                 "leaq (%[start_addr], %%r9), %%rcx \n" /* rcx: flush end offset, rdx->rcx */
+                 //		"add %[start_addr], %%rcx"
+                 "CACHEFENCE_FLUSHONE: \n"
+                 "clwb (%%rdx) \n" /* Flush from rdx to rcx */
+                 "add $0x40, %%rdx \n"
+                 "cmp %%rcx, %%rdx \n"
+                 "jl CACHEFENCE_FLUSHONE \n"
+
+                 "cmp %[fence], %%r11 \n"
+                 "jl CACHEFENCE_FLUSHBEGIN \n" CACHEFENCE_FENCE
+
+                 "cmp %[accesssize], %%r9 \n"
+                 "jl CACHEFENCE_FENCEBEGIN \n"
+
+                 ::[start_addr] "r"(start_addr),
+                 [accesssize] "r"(size), [cache] "r"(cache), [fence] "r"(fence)
+                 /* xmm0 is overwritten and the buffer is written without being an
+                  * operand: declare both to the compiler. */
+                 : "%rdx", "%rcx", "%r11", "%r10", "%r9", "xmm0", "memory", "cc");
+    KERNEL_END
+    return;
+}
+
+/*
+ * cacheprobe - store to and clflush two adjacent 64-byte lines at every
+ * `stride` step between start_addr and end_addr, then mfence once.
+ *
+ * start_addr: first address touched (also copied into xmm0)
+ * end_addr:   the loop continues while the cursor r8 is below this address
+ * stride:     distance in bytes between consecutive probe positions
+ *
+ * Each step writes r8 and r8 + 0x40, so the last step touches up to 128
+ * bytes starting at the final cursor position.
+ */
+void cacheprobe(char *start_addr, char *end_addr, long stride) {
+    KERNEL_BEGIN
+    asm volatile("mov %[start_addr], %%r8 \n" /* r8: cursor */
+                 "movq %[start_addr], %%xmm0 \n"
+                 "LOOP_CACHEPROBE: \n"
+                 "vmovdqa64 %%zmm0, 0x0(%%r8) \n"
+                 "clflush (%%r8) \n"
+                 "vmovdqa64 %%zmm0, 0x40(%%r8) \n"
+                 "clflush 0x40(%%r8) \n"
+                 "add %[stride], %%r8 \n"
+                 "cmp %[end_addr], %%r8 \n"
+                 "jl LOOP_CACHEPROBE \n"
+                 "mfence \n"
+
+                 ::[start_addr] "r"(start_addr),
+                 [end_addr] "r"(end_addr), [stride] "r"(stride)
+                 /* xmm0 is overwritten and the buffer is written without being an
+                  * operand: declare both to the compiler. */
+                 : "%r8", "xmm0", "memory", "cc");
+    KERNEL_END
+    return;
+}
+
+/*
+ * imcprobe - sweep [start_addr, end_addr) with 64-byte non-temporal stores
+ * (vmovntdq), `loop` times in a row.
+ *
+ * start_addr: first address written (also copied into xmm0)
+ * end_addr:   the inner loop continues while the cursor r8 is below this
+ * loop:       number of full sweeps (the sweep always runs at least once)
+ *
+ * No fence is issued after the stores inside this function.
+ */
+void imcprobe(char *start_addr, char *end_addr, long loop) {
+    KERNEL_BEGIN
+    asm volatile("xor %%r9, %%r9 \n" /* r9: sweep counter */
+                 "movq %[start_addr], %%xmm0 \n"
+
+                 "LOOP1_IMCPROBE: \n"
+                 "mov %[start_addr], %%r8 \n" /* r8: cursor, restarted for every sweep */
+                 "LOOP2_IMCPROBE: \n"
+                 "vmovntdq %%zmm0, 0x0(%%r8) \n"
+                 "add $0x40, %%r8 \n"
+                 "cmp %[end_addr], %%r8 \n"
+                 "jl LOOP2_IMCPROBE \n"
+
+                 "add $1, %%r9 \n"
+                 "cmp %[loop], %%r9 \n"
+                 "jl LOOP1_IMCPROBE \n"
+
+                 ::[start_addr] "r"(start_addr),
+                 [end_addr] "r"(end_addr), [loop] "r"(loop)
+                 /* xmm0 is overwritten and the buffer is written without being an
+                  * operand: declare both to the compiler. */
+                 : "%r8", "%r9", "xmm0", "memory", "cc");
+    KERNEL_END
+    return;
+}
+
+/*
+ * seq_load - read [start_addr, end_addr) sequentially with 64-byte
+ * non-temporal loads (vmovntdqa), processed in chunks of `size` bytes.
+ *
+ * start_addr: first address read
+ * end_addr:   the outer loop continues while the chunk base r9 is below this
+ * size:       chunk length in bytes; r9 advances by this much per chunk
+ */
+void seq_load(char *start_addr, char *end_addr, long size) {
+    KERNEL_BEGIN
+    asm volatile("mov %[start_addr], %%r9 \n" /* r9: base of the current chunk */
+
+                 "LOOP_SEQLOAD1: \n"
+                 "xor %%r8, %%r8 \n" /* r8: offset inside the chunk */
+                 "LOOP_SEQLOAD2: \n"
+                 "vmovntdqa 0x0(%%r9, %%r8), %%zmm0 \n"
+                 "add $0x40, %%r8 \n"
+                 "cmp %[size], %%r8 \n"
+                 "jl LOOP_SEQLOAD2 \n"
+
+                 "add %[size], %%r9 \n"
+                 "cmp %[end_addr], %%r9 \n"
+                 "jl LOOP_SEQLOAD1 \n"
+
+                 ::[start_addr] "r"(start_addr),
+                 [end_addr] "r"(end_addr), [size] "r"(size)
+                 /* zmm0 is overwritten ("xmm0" names the same hard register) and the
+                  * buffer is read without being an operand. */
+                 : "%r8", "%r9", "xmm0", "memory", "cc");
+    KERNEL_END
+    return;
+}
+/*
+ * seq_store - write [start_addr, end_addr) sequentially in chunks of `size`
+ * bytes; every 64-byte line is stored with vmovdqa64 and then written back
+ * with clwb.
+ *
+ * start_addr: first address written (also copied into xmm0)
+ * end_addr:   the outer loop continues while the chunk base r9 is below this
+ * size:       chunk length in bytes; r9 advances by this much per chunk
+ *
+ * NOTE(review): the instruction sequence is identical to seq_clwb() in this
+ * file - confirm whether a plain store without clwb was intended here.
+ */
+void seq_store(char *start_addr, char *end_addr, long size) {
+    KERNEL_BEGIN
+    asm volatile("mov %[start_addr], %%r9 \n" /* r9: base of the current chunk */
+                 "movq %[start_addr], %%xmm0 \n"
+
+                 "LOOP_SEQSTORE1: \n"
+                 "xor %%r8, %%r8 \n" /* r8: offset inside the chunk */
+                 "LOOP_SEQSTORE2: \n"
+                 "vmovdqa64  %%zmm0,  0x0(%%r9, %%r8) \n"
+                 "clwb  (%%r9, %%r8) \n"
+                 "add $0x40, %%r8 \n"
+                 "cmp %[size], %%r8 \n"
+                 "jl LOOP_SEQSTORE2 \n"
+
+                 "add %[size], %%r9 \n"
+                 "cmp %[end_addr], %%r9 \n"
+                 "jl LOOP_SEQSTORE1 \n"
+
+                 ::[start_addr] "r"(start_addr),
+                 [end_addr] "r"(end_addr), [size] "r"(size)
+                 /* xmm0 is overwritten and the buffer is written without being an
+                  * operand: declare both to the compiler. */
+                 : "%r8", "%r9", "xmm0", "memory", "cc");
+    KERNEL_END
+    return;
+}
+
+/*
+ * seq_clwb - write [start_addr, end_addr) sequentially in chunks of `size`
+ * bytes; every 64-byte line is stored with vmovdqa64 and then written back
+ * with clwb.
+ *
+ * start_addr: first address written (also copied into xmm0)
+ * end_addr:   the outer loop continues while the chunk base r9 is below this
+ * size:       chunk length in bytes; r9 advances by this much per chunk
+ */
+void seq_clwb(char *start_addr, char *end_addr, long size) {
+    KERNEL_BEGIN
+    asm volatile("mov %[start_addr], %%r9 \n" /* r9: base of the current chunk */
+                 "movq %[start_addr], %%xmm0 \n"
+
+                 "LOOP_SEQCLWB1: \n"
+                 "xor %%r8, %%r8 \n" /* r8: offset inside the chunk */
+                 "LOOP_SEQCLWB2: \n"
+                 "vmovdqa64  %%zmm0,  0x0(%%r9, %%r8) \n"
+                 "clwb  (%%r9, %%r8) \n"
+                 "add $0x40, %%r8 \n"
+                 "cmp %[size], %%r8 \n"
+                 "jl LOOP_SEQCLWB2 \n"
+
+                 "add %[size], %%r9 \n"
+                 "cmp %[end_addr], %%r9 \n"
+                 "jl LOOP_SEQCLWB1 \n"
+
+                 ::[start_addr] "r"(start_addr),
+                 [end_addr] "r"(end_addr), [size] "r"(size)
+                 /* xmm0 is overwritten and the buffer is written without being an
+                  * operand: declare both to the compiler. */
+                 : "%r8", "%r9", "xmm0", "memory", "cc");
+    KERNEL_END
+}
+
+/*
+ * seq_nt - write [start_addr, end_addr) sequentially with 64-byte
+ * non-temporal stores (vmovntdq), processed in chunks of `size` bytes.
+ *
+ * start_addr: first address written (also copied into xmm0)
+ * end_addr:   the outer loop continues while the chunk base r9 is below this
+ * size:       chunk length in bytes; r9 advances by this much per chunk
+ *
+ * No fence is issued after the stores inside this function.
+ */
+void seq_nt(char *start_addr, char *end_addr, long size) {
+    KERNEL_BEGIN
+    asm volatile("mov %[start_addr], %%r9 \n" /* r9: base of the current chunk */
+                 "movq %[start_addr], %%xmm0 \n"
+
+                 "LOOP_SEQNT1: \n"
+                 "xor %%r8, %%r8 \n" /* r8: offset inside the chunk */
+                 "LOOP_SEQNT2: \n"
+                 "vmovntdq %%zmm0, 0x0(%%r9, %%r8) \n"
+                 "add $0x40, %%r8 \n"
+                 "cmp %[size], %%r8 \n"
+                 "jl LOOP_SEQNT2 \n"
+
+                 "add %[size], %%r9 \n"
+                 "cmp %[end_addr], %%r9 \n"
+                 "jl LOOP_SEQNT1 \n"
+
+                 ::[start_addr] "r"(start_addr),
+                 [end_addr] "r"(end_addr), [size] "r"(size)
+                 /* xmm0 is overwritten and the buffer is written without being an
+                  * operand: declare both to the compiler. */
+                 : "%r8", "%r9", "xmm0", "memory", "cc");
+    KERNEL_END
+}
+
+/* Shared scratch state written by the BEFORE/AFTER and RAW_* timing macros below. */
+/* CLOCK_MONOTONIC_RAW timestamps taken at the start and end of a measurement. */
+struct timespec tstart, tend;
+/* High/low 32-bit halves of the TSC values captured with rdtscp. */
+unsigned int c_store_start_hi, c_store_start_lo;
+unsigned int c_ntload_start_hi, c_ntload_start_lo;
+unsigned int c_ntload_end_hi, c_ntload_end_lo;
+/* 64-bit TSC values assembled from the halves above by AFTER / RAW_FINAL. */
+unsigned long c_store_start;
+unsigned long c_ntload_start, c_ntload_end;
+/* diff: elapsed nanoseconds between tstart and tend; pages is not written by the macros in this section. */
+long pages, diff;
+
+		 
+/*
+ * BEFORE(buf, size, name) - measurement prologue.
+ *  1. Issue clflush for every byte offset in [0, size) of buf (r8 advances
+ *     by 1 per iteration), then mfence. `name` is pasted into the loop label
+ *     so that several expansions get distinct labels.
+ *  2. Store the current CLOCK_MONOTONIC_RAW time in tstart.
+ *  3. Read the TSC with rdtscp, bracketed by mfence, into
+ *     c_store_start_hi / c_store_start_lo. rdtscp also writes ecx, hence the
+ *     rcx clobber.
+ */
+#define BEFORE(buf, size, name)						\
+  asm volatile("xor %%r8, %%r8 \n" /* r8: counter */			\
+	       "FLUSH_LOOP" #name ": \n"				\
+	       "lea (%[buf], %%r8), %%r9 \n"				\
+  	       "clflush (%%r9) \n"					\
+	       "add $1, %%r8 \n"					\
+	       "cmp %[size], %%r8 \n"					\
+	       "jl FLUSH_LOOP" #name " \n"				\
+	       "mfence \n"						\
+	       :: [buf] "r" (buf), [size] "r"(size)			\
+	       : "%r8", "%r9");						\
+  clock_gettime(CLOCK_MONOTONIC_RAW, &tstart);				\
+  asm volatile("mfence \n\t"						\
+	       "rdtscp \n\t"						\
+	       "mfence \n\t"						\
+	       "mov %%edx, %[hi]\n\t"					\
+	       "mov %%eax, %[lo]\n\t"					\
+	       : [hi] "=r"(c_store_start_hi), [lo] "=r"(c_store_start_lo) \
+	       :							\
+	       : "rdx", "rax", "rcx");
+
+
+/*
+ * AFTER - measurement epilogue matching BEFORE.
+ * Reads the TSC (rdtscp bracketed by mfence) into c_ntload_end_hi/lo, takes
+ * the CLOCK_MONOTONIC_RAW end time and, if that call succeeds, stores the
+ * elapsed nanoseconds since tstart in `diff`. Finally assembles the 64-bit
+ * c_store_start, c_ntload_start and c_ntload_end values from their halves.
+ * Note that c_ntload_start_hi/lo are not written by BEFORE; in this section
+ * only RAW_BEFORE_READ sets them.
+ */
+#define AFTER								\
+  asm volatile("mfence \n\t"						\
+	       "rdtscp \n\t"						\
+	       "mfence \n\t"						\
+	       "mov %%edx, %[hi]\n\t"					\
+	       "mov %%eax, %[lo]\n\t"					\
+	       : [hi] "=r"(c_ntload_end_hi), [lo] "=r"(c_ntload_end_lo)	\
+	       :							\
+	       : "rdx", "rax", "rcx");					\
+  if (clock_gettime(CLOCK_MONOTONIC_RAW, &tend) == 0) {			\
+    diff = (tend.tv_sec - tstart.tv_sec) * 1e9 + tend.tv_nsec - tstart.tv_nsec; \
+  }									\
+  c_store_start = (((unsigned long)c_store_start_hi) << 32) | c_store_start_lo; \
+  c_ntload_start = (((unsigned long)c_ntload_start_hi) << 32) | c_ntload_start_lo; \
+  c_ntload_end = (((unsigned long)c_ntload_end_hi) << 32) | c_ntload_end_lo;
+
+
+
+/* NOTE(review): not referenced in the visible part of this file; presumably the entry count of a permutation table - confirm at its users. */
+#define LFS_PERMRAND_ENTRIES 0x1000
+/*
+ * RAW_BEFORE_WRITE - start of a read-after-write measurement: stores the
+ * CLOCK_MONOTONIC_RAW time in tstart, then reads the TSC (rdtscp followed by
+ * lfence) into c_store_start_hi / c_store_start_lo.
+ */
+#define RAW_BEFORE_WRITE                                                                                               \
+    clock_gettime(CLOCK_MONOTONIC_RAW, &tstart);                                                                       \
+    asm volatile("rdtscp \n\t"                                                                                         \
+                 "lfence \n\t"                                                                                         \
+                 "mov %%edx, %[hi]\n\t"                                                                                \
+                 "mov %%eax, %[lo]\n\t"                                                                                \
+                 : [hi] "=r"(c_store_start_hi), [lo] "=r"(c_store_start_lo)                                            \
+                 :                                                                                                     \
+                 : "rdx", "rax", "rcx");
+/*
+ * RAW_BEFORE_READ - intermediate timestamp: reads the TSC (rdtscp followed
+ * by lfence) into c_ntload_start_hi / c_ntload_start_lo.
+ */
+#define RAW_BEFORE_READ                                                                                                \
+    asm volatile("rdtscp \n\t"                                                                                         \
+                 "lfence \n\t"                                                                                         \
+                 "mov %%edx, %[hi]\n\t"                                                                                \
+                 "mov %%eax, %[lo]\n\t"                                                                                \
+                 : [hi] "=r"(c_ntload_start_hi), [lo] "=r"(c_ntload_start_lo)                                          \
+                 :                                                                                                     \
+                 : "rdx", "rax", "rcx");
+/*
+ * RAW_FINAL(job_name) - end of a read-after-write measurement.
+ * Reads the TSC (rdtscp bracketed by lfence) into c_ntload_end_hi/lo, takes
+ * the CLOCK_MONOTONIC_RAW end time and, if that call succeeds, stores the
+ * elapsed nanoseconds since tstart in `diff`. Then assembles the 64-bit
+ * c_store_start, c_ntload_start and c_ntload_end values from their halves.
+ * The job_name argument is not used by the expansion.
+ */
+#define RAW_FINAL(job_name)						\
+  asm volatile("lfence \n\t"						\
+	       "rdtscp \n\t"						\
+	       "lfence \n\t"						\
+	       "mov %%edx, %[hi]\n\t"					\
+	       "mov %%eax, %[lo]\n\t"					\
+	       : [hi] "=r"(c_ntload_end_hi), [lo] "=r"(c_ntload_end_lo)	\
+	       :							\
+	       : "rdx", "rax", "rcx");					\
+  if (clock_gettime(CLOCK_MONOTONIC_RAW, &tend) == 0) {			\
+    diff = (tend.tv_sec - tstart.tv_sec) * 1e9 + tend.tv_nsec - tstart.tv_nsec; \
+  }									\
+  c_store_start = (((unsigned long)c_store_start_hi) << 32) | c_store_start_lo; \
+  c_ntload_start = (((unsigned long)c_ntload_start_hi) << 32) | c_ntload_start_lo; \
+  c_ntload_end = (((unsigned long)c_ntload_end_hi) << 32) | c_ntload_end_lo;
diff --git a/script/collect_weights.py b/script/collect_weights.py
new file mode 100644
index 0000000..4d2c97d
--- /dev/null
+++ b/script/collect_weights.py
@@ -0,0 +1,15 @@
+import os, subprocess
+import time
+# Workload names; not referenced by the functions in this script.
+workloads = ["mlc","ld","st","nt-ld","nt-st","ptr-chasing"]
+
+
+def batch_run():
+    """Launch the CXLMemSim debug build once through the system shell."""
+    simulator = "../cmake-build-debug/CXLMemSim"
+    os.system(simulator)
+    
+def run_command(size):
+    """Run CXLMemSim with ``-s`` and return the elapsed time in seconds.
+
+    NOTE(review): ``size`` is accepted but not forwarded to the command line;
+    presumably it was meant to follow ``-s`` - confirm against the CLI.
+    """
+    cmd = ["../cmake-build-debug/CXLMemSim" ,"-s"]
+    print(cmd)
+    # perf_counter() is monotonic, so the measured interval cannot be skewed
+    # by wall-clock adjustments that happen while the simulator runs.
+    start_time = time.perf_counter()
+    subprocess.run(cmd)
+    return time.perf_counter() - start_time
diff --git a/script/dump_pmu.py b/script/dump_pmu.py
new file mode 100644
index 0000000..dcc2aea
--- /dev/null
+++ b/script/dump_pmu.py
@@ -0,0 +1,53 @@
+import csv
+import matplotlib.pyplot as plt
+import os
+import json
+
+# Module-level PMU bookkeeping; none of these names is referenced by the
+# functions in this script.
+pmu_list = ["INST_RETIRED.ANY"]
+# NOTE(review): presumably (value, value) pairs per event sampled after/before a run - confirm.
+pmu_core_after = {"INST_RETIRED.ANY": (0, 0)}
+pmu_core_before = {"INST_RETIRED.ANY": (0, 0)}
+
+
+def get_perfmon(path: str, pmu: list) -> dict:
+    """Look up PMU events by name in a perfmon JSON file.
+
+    Args:
+        path: JSON file with a top-level ``"Events"`` list whose entries
+            carry ``"EventName"``, ``"UMask"`` and ``"EventCode"`` strings.
+        pmu: iterable of event names to look up (a dict works too; its keys
+            are used).
+
+    Returns:
+        Mapping of each requested event name found in the file to its
+        combined code, i.e. the ``UMask`` string followed by the
+        ``EventCode`` digits without their ``0x`` prefix. Names missing from
+        the file are skipped.
+    """
+    # The file has to be opened before it can be parsed; the original read
+    # from ``f`` ahead of the ``with`` block and failed with a NameError.
+    with open(path, "r") as f:
+        cur_csv = json.load(f)
+
+    # Index once by name; the first occurrence of a name wins.
+    events_by_name = {}
+    for event in cur_csv["Events"]:
+        events_by_name.setdefault(event.get("EventName"), event)
+
+    data_dict = {}
+    for event_name in pmu:
+        event = events_by_name.get(event_name)
+        if event is None:
+            continue
+
+        umask = event["UMask"]
+        event_code = event["EventCode"]
+
+        # Combine UMask and EventCode
+        combined_code = (
+            umask + event_code[2:]
+        )  # Concatenate and remove '0x' from EventCode
+        combined_code_hex = (
+            "0x" + combined_code[2:]
+        )  # Add '0x' back for hex representation
+
+        # Print the results
+        print(f"Event Name: {event_name}")
+        print(f"Combined UMask and EventCode: {combined_code_hex}")
+        data_dict[event_name] = combined_code_hex
+    return data_dict
+
+
+def batch_pmu_run(pmu: dict):
+    """Print every PMU name and run the ld2 microbenchmark for every fourth one.
+
+    After each simulator run the produced ``output_pmu.csv`` is renamed to
+    ``ld_pmu2_results.csv``.
+    """
+    for i, p in enumerate(pmu):
+        print(p)
+        if i % 4 == 0:
+            # NOTE(review): the command string originally ended in a dangling
+            # "+" (a SyntaxError that kept the whole script from loading).
+            # Presumably a PMU specification was meant to follow "--p";
+            # confirm against the CXLMemSim command line before relying on it.
+            os.system(
+                "../cmake-build-debug/CXLMemSim -t ../cmake-build-debug/microbench/ld2 -i 100 --p"
+            )
+            os.system("mv ./output_pmu.csv ./ld_pmu2_results.csv")
+
+
+if __name__ == "__main__":
+    # Look the same event set up in the core and both uncore SPR event files.
+    pmu = {"INST_RETIRED.ANY": 0}
+    for events_file in (
+        "./perfmon/SPR/events/sapphirerapids_core.json",
+        "./perfmon/SPR/events/sapphirerapids_uncore.json",
+        "./perfmon/SPR/events/sapphirerapids_uncore_experimental.json",
+    ):
+        get_perfmon(events_file, pmu)
+    # x, y = load_csv('data.csv')
+    # draw_graph(x, y)
diff --git a/script/get_all_results.py b/script/get_all_results.py
new file mode 100644
index 0000000..e69de29
diff --git a/script/ld_base_result.py b/script/ld_base_result.py
new file mode 100644
index 0000000..b157c82
--- /dev/null
+++ b/script/ld_base_result.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+
+import subprocess
+import time
+import csv
+import sys, os
+
+# Workload names; not referenced by the functions in this script.
+workloads = ["mlc", "ld", "st", "nt-ld", "nt-st", "ptr-chasing"]
+
+
+def run_command(size, mem_node):
+    """Run the ``ld_base<size>`` microbenchmark bound to a NUMA node.
+
+    Args:
+        size: suffix appended to the ``ld_base`` binary name.
+        mem_node: NUMA node passed to ``numactl -m``.
+
+    Returns:
+        The benchmark's standard output parsed as an integer.
+
+    Raises:
+        ValueError: if the output is not a single integer (for example when
+            the binary is missing and nothing is printed).
+    """
+    cmd = [
+        f"/usr/bin/numactl -m {mem_node} ../cmake-build-debug/microbench/ld_base" + str(size),
+    ]
+    print(cmd)
+    # With shell=True only the first list element is interpreted as the
+    # command line, so the one-element list behaves like a plain string.
+    result = subprocess.run(
+        cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+    )
+    out, err = result.stdout, result.stderr
+    print(f"err: {err}, out: {out}")
+    return int(out)
+
+
+def run_cxlmemsim_command(size, mem_node):
+    """Run the ``ld<size>`` microbenchmark under CXLMemSim and load its PMU CSV.
+
+    Args:
+        size: suffix appended to the ``ld`` binary name and used in the name
+            of the archived CSV.
+        mem_node: NUMA node passed to ``numactl -m``.
+
+    Returns:
+        A pandas DataFrame read from ``./output_pmu.csv``; the file is then
+        renamed to ``./ld_pmu<size>_results.csv``.
+    """
+    # pandas is not imported at module level in this script, so the name
+    # ``pd`` was undefined here; import it where it is needed.
+    import pandas as pd
+
+    cmd = [
+        "LOGV=1",
+        f"/usr/bin/numactl -m {mem_node}",
+        "../cmake-build-debug/CXLMemSim",
+        "-t",
+        "../cmake-build-debug/microbench/ld" + str(size),
+        "-i",
+        "100",
+    ]
+    cmd = " ".join(cmd)
+    print(cmd)
+    os.system(cmd)
+    df = pd.read_csv("./output_pmu.csv")
+    os.system(f"mv ./output_pmu.csv ./ld_pmu{size}_results.csv")
+    return df
+
+def execute(cmd):
+    """Run ``cmd`` through the shell, echo its captured streams and return stdout.
+
+    Returns:
+        The raw bytes the command wrote to standard output.
+    """
+    print(cmd)
+    finished = subprocess.run(
+        cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+    )
+    out = finished.stdout
+    err = finished.stderr
+    print(f"err: {err}, out: {out}")
+    return out
+    
+
+def main():
+    """Collect ld_base timings for every size with hardware prefetching off.
+
+    Appends a ``size,time`` header plus five rounds of measurements (sizes
+    1, 2, 4, ... 256) to ``ld_base_results_<mode>.csv``.
+    """
+    # MSR 0x1a4 holds the prefetcher-disable bits: writing 0xf turns the four
+    # prefetchers off, writing 0x0 turns them back on. The original wrote 0xf
+    # in both cases, leaving prefetching disabled after the script finished.
+    prefetching_off = ["wrmsr -a 0x1a4 0xf"]
+    prefetching_on = ["wrmsr -a 0x1a4 0x0"]
+
+    sizes = [2**x for x in range(0, 9)]
+
+    mode = "local"
+    mem_node = 0 if mode == "local" else 1
+
+    execute(prefetching_off)
+    try:
+        # The context manager flushes and closes the CSV even if a run fails.
+        with open(f"ld_base_results_{mode}.csv", "a", newline="") as f:
+            writer = csv.writer(f, delimiter=",")
+            writer.writerow(["size", "time"])
+            for i in range(5):
+                for size in sizes:
+                    exec_time = run_command(size, mem_node)
+                    writer.writerow([size, exec_time])
+    finally:
+        # Always restore prefetching, even when a benchmark run raised.
+        execute(prefetching_on)
+    # for size in sizes:
+    #     df = run_cxlmemsim_command(size,1)
+
+# Run the measurement only when the file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/script/ld_plot_result.py b/script/ld_plot_result.py
new file mode 100644
index 0000000..d314146
--- /dev/null
+++ b/script/ld_plot_result.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+
+import argparse
+import subprocess
+import time
+from math import sqrt
+import matplotlib.pyplot as plt
+import pandas as pd
+
+workloads = ["mlc", "ld", "st", "nt-ld", "nt-st", "ptr-chasing"]
+pmus = [  # CSV column names; print_pmu_csv() produces one plot per entry
+    "mon0_tatal_stall_0_0",  # NOTE(review): spelling presumably matches the CSV header - confirm
+    "mon0_all_dram_rds_0_0",
+    "mon0_l2stall_0_0",
+    "mon0_snoop_fw_wb_0_0",
+    "mon0_llcl_hits_0_0",
+    "mon0_llcl_miss_0_0",
+    "mon0_null_0_0",
+    "mon0_null_0_0",  # listed twice, so this column is processed twice
+]
+
+
+def get_mean_and_ebars(df, groups, select):
+    """returns df with error bars. gropus includes columns to groupby"""
+    agg = df.groupby(groups)[select].agg(["mean", "count", "std"])
+    error = []
+    for i in agg.index:
+        mean, count, std = agg.loc[i]
+        error.append(1.95 * std / sqrt(count))
+
+    agg["error"] = error
+
+    return agg[["mean"]], agg[["error"]]
+
+
+def print_pmu_csv():
+    sizes = [2**x for x in range(0, 9)]
+    for c in pmus:
+        for i in sizes:
+            df = pd.read_csv(f"ld_pmu{i}_results.csv")
+            col = df[df[c]<1844674407][c]
+            print(col)
+            if col[11] == "0":
+                print(col)
+                continue
+            # Plotting the data
+            plt.plot(col, marker="o", linestyle="-", label=i)
+
+        # Adding title and labels
+        plt.title("PMU Plot for ld")
+        plt.xlabel("PMU gathered per epoch")
+        plt.ylabel(f"{c} Values")
+        plt.legend()
+        plt.savefig(f"ld_results_pmu_{c}.png")
+
+
+def main():  # Entry point: currently only renders the per-counter PMU plots.
+    parser = argparse.ArgumentParser(description="plot results.")  # unused while the CLI code below stays commented out
+    # parser.add_argument(
+    #     "-f", "--file_name", nargs="?", default="ld_results.csv",
+    #     help="csv containing results.")
+
+    # args = parser.parse_args()
+
+    # df = pd.read_csv(args.file_name)
+    # means, error = get_mean_and_ebars(df, ["size"], "time")
+
+    # fig,ax = plt.subplots()
+
+    # ax.errorbar(means.index, means["mean"],yerr=error["error"], capsize=4)
+    # #means.plot(ax=ax, yerr=error, grid=True, rot=0, capsize=4)
+    # ax.set_xlabel("Size")
+    # ax.set_ylabel("Execution Time (seconds)")
+    # print(error)
+
+    # fig.savefig("ld_results.png")
+    print_pmu_csv()  # reads ld_pmu<size>_results.csv files and writes ld_results_pmu_<counter>.png
+
+
+if __name__ == "__main__":
+    main()
diff --git a/script/ld_result.py b/script/ld_result.py
new file mode 100644
index 0000000..9fb5601
--- /dev/null
+++ b/script/ld_result.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+
+import subprocess
+import time
+import csv
+import sys, os
+
+workloads = ["mlc", "ld", "st", "nt-ld", "nt-st", "ptr-chasing"]
+
+
+def run_command(size, mem_node):
+    start_time = time.time()
+    cmd = [
+        f"/usr/bin/numactl -m {mem_node} ../cmake-build-debug/microbench/ld" + str(size),
+    ]
+    print(cmd)
+    process = subprocess.Popen(
+        cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+    )
+
+    out, err = process.communicate()
+    print(f"err: {err}, out: {out}")
+    return int(out)
+
+
+def run_cxlmemsim_command(size, mem_node):
+    # start_time = time.time()
+    cmd = [
+        "LOGV=1",
+        f"/usr/bin/numactl -m {mem_node}",
+        "../cmake-build-debug/CXLMemSim",
+        "-t",
+        "../cmake-build-debug/microbench/ld" + str(size),
+        "-i",
+        "100",
+    ]
+    cmd = " ".join(cmd)
+    print(cmd)
+    os.system(cmd)
+    # end_time = time.time()
+    df = pd.read_csv("./output_pmu.csv")
+    os.system(f"mv ./output_pmu.csv ./ld_pmu{size}_results.csv")
+    return df
+
+def execute(cmd):
+    print(cmd)
+    process = subprocess.Popen(
+        cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+    )
+
+    out, err = process.communicate()
+    print(f"err: {err}, out: {out}")
+    return out
+
+
+def main():
+    prefetching_off = [f"wrmsr -a 0x1a4 0xf"]
+    prefetching_on = [f"wrmsr -a 0x1a4 0xf"]
+
+    sizes = [2**x for x in range(0, 9)]
+
+    mode = "remote"
+    mem_node = 0 if mode == "local" else 1
+
+
+    execute(prefetching_off)
+    f = open(f"ld_results_{mode}_noprefetch.csv", "a")
+    writer = csv.writer(f, delimiter=",")
+    writer.writerow(["size", "time"])
+    for i in range(10):
+        for size in sizes:
+            exec_time = run_command(size, mem_node)
+            writer.writerow([size, exec_time])
+
+    execute(prefetching_on)            
+    # for size in sizes:
+    #     df = run_cxlmemsim_command(size,1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/script/st_result.py b/script/st_result.py
new file mode 100644
index 0000000..322b75e
--- /dev/null
+++ b/script/st_result.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+
+import subprocess
+import time
+import csv
+import sys, os
+import pandas as pd
+
+workloads = ["mlc", "ld", "st", "nt-ld", "nt-st", "ptr-chasing"]
+
+
+def run_command(size, mem_node):
+    start_time = time.time()
+    cmd = [
+        f"/usr/bin/numactl -m {mem_node} ../cmake-build-debug/microbench/st" + str(size),
+    ]
+    print(cmd)
+    process = subprocess.Popen(
+        cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+    )
+
+    out, err = process.communicate()
+    print(f"err: {err}, out: {out}")
+    return int(out)
+
+
+def run_cxlmemsim_command(size, mem_node):
+    # start_time = time.time()
+    cmd = [
+        "LOGV=1",
+        f"/usr/bin/numactl -m {mem_node}",
+        "../cmake-build-debug/CXLMemSim",
+        "-t",
+        "../cmake-build-debug/microbench/st" + str(size),
+        "-i",
+        "100",
+    ]
+    cmd = " ".join(cmd)
+    print(cmd)
+    os.system(cmd)
+    # end_time = time.time()
+    df = pd.read_csv("./output_pmu.csv")
+    os.system(f"mv ./output_pmu.csv ./st_pmu{size}_results.csv")
+    return df
+
+
+def main():
+    sizes = [2**x for x in range(0, 9)]
+
+    mode = "remote"
+    mem_node = 0 if mode == "local" else 1
+
+        
+    f = open(f"st_results_{mode}.csv", "a")
+
+    writer = csv.writer(f, delimiter=",")
+
+    writer.writerow(["size", "time"])
+    for i in range(25):
+        for size in sizes:
+            exec_time = run_command(size, mem_node)
+            writer.writerow([size, exec_time])
+
+    # for size in sizes:
+    #     df = run_cxlmemsim_command(size,1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/script/wb_result.py b/script/wb_result.py
new file mode 100644
index 0000000..e88208f
--- /dev/null
+++ b/script/wb_result.py
@@ -0,0 +1,104 @@
+import subprocess
+import time
+import matplotlib.pyplot as plt
+import pandas as pd
+import os, csv
+import re
+
+workloads = ["mlc", "ld", "st", "nt-ld", "nt-st", "ptr-chasing"]
+
+
+def run_command(size, mem_node):
+    start_time = time.time()
+    cmd = [
+        f"/usr/bin/numactl -m {mem_node} ../../MLC/Linux/mlc  --loaded_latency -W"
+        + str(size),
+    ]
+    print(cmd)
+    process = subprocess.Popen(
+        cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+    )
+    print(f"err: {err}, out: {out}")
+
+    out, err = process.communicate()
+
+    regex_pattern = r"\t(\d+)\.\d+\t\s*(\d+)\.\d+"
+
+    # Find all matches
+    matches = re.findall(regex_pattern, out)
+
+    print(f"err: {err}, out: {matches}")
+    return int(out)
+
+
+def run_cxlmemsim_command(size, mem_node):
+    # start_time = time.time()
+    cmd = [
+        "LOGV=1",
+        f"/usr/bin/numactl -m {mem_node}",
+        "../cmake-build-debug/CXLMemSim",
+        "-t",
+        f"'../../MLC/Linux/mlc  --loaded_latency -W{size}'",
+        "-i",
+        "100",
+        "-c",
+        '0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23'
+    ]
+    cmd = " ".join(cmd)
+    print(cmd)
+    os.system(cmd)
+    # end_time = time.time()
+    df = pd.read_csv("./output_pmu.csv")
+    os.system(f"mv ./output_pmu.csv ./wb_pmu{size}_results.csv")
+    return df
+
+
+def main():
+    sizes = [x for x in range(2, 12)]
+
+    mode = "remote"
+    mem_node = 0 if mode == "local" else 1
+
+    inject_latency = [
+        "00000",
+        "00002",
+        "00008",
+        "00015",
+        "00050",
+        "00100",
+        "00200",
+        "00300",
+        "00400",
+        "00500",
+        "00700",
+        "01000",
+        "01300",
+        "01700",
+        "02500",
+        "03500",
+        "05000",
+        "09000",
+        "20000",
+    ]
+    writer = []
+    for latency in inject_latency:
+        f = open(f"wb_results_{mode}_{latency}.csv", "a")
+        writer.append(csv.writer(f, delimiter=","))
+
+        writer[-1].writerow(["size", "time", "bw"])
+    # f = open(f"wb_results_{mode}.csv", "a")
+    # writer.append(csv.writer(f, delimiter=","))
+
+    # writer.writerow(["size", "time", "bw"])
+    for i in range(25):
+        for size in sizes:
+            exec_time = run_command(size, mem_node)
+
+            writer.writerow([size, exec_time])
+
+    # for size in sizes:
+    #     df = run_cxlmemsim_command(size,1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/collectmmap.c b/src/collectmmap.c
deleted file mode 100644
index d24e3da..0000000
--- a/src/collectmmap.c
+++ /dev/null
@@ -1,85 +0,0 @@
-#include <linux/filter.h>
-#include <linux/ptrace.h>
-#include <linux/version.h>
-#include <uapi/linux/bpf.h>
-
-/* helper macro to place programs, maps, license in
- * different sections in elf_bpf file. Section names
- * are interpreted by elf_bpf loader
- */
-#define SEC(NAME) __attribute__((section(NAME), used))
-
-/* helper functions called from eBPF programs written in C */
-static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) = (void *)BPF_FUNC_probe_read;
-static unsigned long long (*bpf_ktime_get_ns)(void) = (void *)BPF_FUNC_ktime_get_ns;
-static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) = (void *)BPF_FUNC_trace_printk;
-static unsigned long long (*bpf_get_current_pid_tgid)(void) = (void *)BPF_FUNC_get_current_pid_tgid;
-/* a helper structure used by eBPF C program
- * to describe map attributes to elf_bpf loader
- */
-struct bpf_map_def {
-    unsigned int type;
-    unsigned int key_size;
-    unsigned int value_size;
-    unsigned int max_entries;
-    unsigned int map_flags;
-    unsigned int inner_map_idx;
-};
-#define PT_REGS_PARM1(x) ((x)->di)
-#define PT_REGS_PARM2(x) ((x)->si)
-SEC("kprobe/__x64_sys_munmap")
-int munmap_init(struct pt_regs *ctx) {
-    long size;
-    long address;
-    char fmt[] = "munmap %ld %ld %u\n";
-    u32 pid = bpf_get_current_pid_tgid();
-    bpf_probe_read(&size, sizeof(size), (void *)&PT_REGS_PARM2(ctx));
-    bpf_probe_read(&address, sizeof(address), (void *)&PT_REGS_PARM1(ctx));
-    if (size > 0) {
-        bpf_trace_printk(fmt, sizeof(fmt), size, address, pid);
-    }
-    return 0;
-}
-SEC("kprobe/__x64_sys_brk")
-int brk_init(struct pt_regs *ctx) {
-    long address;
-    char fmt[] = "brk %ld %u\n";
-    u32 pid = bpf_get_current_pid_tgid();
-    bpf_probe_read(&address, sizeof(address), (void *)&PT_REGS_PARM1(ctx));
-    bpf_trace_printk(fmt, sizeof(fmt), address, pid);
-    return 0;
-}
-SEC("kretprobe/__x64_sys_brk")
-int brk_finish(struct pt_regs *ctx) {
-    int size;
-    char fmt[] = "brkret %d %u\n";
-    u32 pid = bpf_get_current_pid_tgid();
-    bpf_probe_read(&size, sizeof(size), (void *)&PT_REGS_PARM1(ctx));
-    if (size > 0) {
-        bpf_trace_printk(fmt, sizeof(fmt), size, pid);
-    }
-    return 0;
-}
-SEC("kprobe/__x64_sys_sbrk")
-int sbrk_init(struct pt_regs *ctx) {
-    int size;
-    char fmt[] = "sbrkret %d %u\n";
-    u32 pid = bpf_get_current_pid_tgid();
-    bpf_probe_read(&size, sizeof(size), (void *)&PT_REGS_PARM1(ctx));
-    if (size > 0) {
-        bpf_trace_printk(fmt, sizeof(fmt), size, pid);
-    }
-    return 0;
-}
-SEC("kretprobe/__x64_sys_sbrk")
-int sbrk_finish(struct pt_regs *ctx) {
-    long address;
-    char fmt[] = "sbrkret %ld %u\n";
-    u32 pid = bpf_get_current_pid_tgid();
-    bpf_probe_read(&address, sizeof(address), (void *)&PT_REGS_PARM1(ctx));
-    bpf_trace_printk(fmt, sizeof(fmt), address, pid);
-
-    return 0;
-}
-char _license[] SEC("license") = "GPL";
-u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/src/cxlcontroller.cpp b/src/cxlcontroller.cpp
index b40aa9e..e21f086 100644
--- a/src/cxlcontroller.cpp
+++ b/src/cxlcontroller.cpp
@@ -32,14 +32,18 @@ void CXLController::construct_topo(std::string_view newick_tree) {
     }
 }
 
-CXLController::CXLController(Policy *p, int capacity, bool is_page, int epoch)
-    : CXLSwitch(0), capacity(capacity), policy(p), is_page(is_page) {
+CXLController::CXLController(AllocationPolicy *p, int capacity, enum page_type page_type_, int epoch)
+    : CXLSwitch(0), capacity(capacity), policy(p), page_type_(static_cast<page_type>(page_type_)) {
     for (auto switch_ : this->switches) {
         switch_->set_epoch(epoch);
     }
     for (auto expander : this->expanders) {
         expander->set_epoch(epoch);
     }
+    // TODO get LRU wb
+    // TODO BW type series
+
+    // differentiate R/W for multi-reader multi-writer
 }
 
 double CXLController::calculate_latency(LatencyPass elem) { return CXLSwitch::calculate_latency(elem) * 1000; }
@@ -119,3 +123,10 @@ std::tuple<double, std::vector<uint64_t>> CXLController::calculate_congestion()
     return CXLSwitch::calculate_congestion();
 }
 void CXLController::set_epoch(int epoch) { CXLSwitch::set_epoch(epoch); }
+// TODO: impl me
+MigrationPolicy::MigrationPolicy() {
+
+}
+PagingPolicy::PagingPolicy() {
+    
+}
diff --git a/src/cxlendpoint.cpp b/src/cxlendpoint.cpp
index 795380a..324c798 100644
--- a/src/cxlendpoint.cpp
+++ b/src/cxlendpoint.cpp
@@ -5,7 +5,7 @@
 #include "cxlendpoint.h"
 
 CXLMemExpander::CXLMemExpander(int read_bw, int write_bw, int read_lat, int write_lat, int id, int capacity)
-    : capacity(capacity), id(id) {
+    : capacity(capacity), id(id), lru_cache(capacity / 1000 / 64) {
     this->bandwidth.read = read_bw;
     this->bandwidth.write = write_bw;
     this->latency.read = read_lat;
@@ -14,8 +14,8 @@ CXLMemExpander::CXLMemExpander(int read_bw, int write_bw, int read_lat, int writ
 double CXLMemExpander::calculate_latency(LatencyPass lat) {
     auto all_access = lat.all_access;
     auto dramlatency = lat.dramlatency;
-    auto ma_ro = lat.ma_ro;
-    auto ma_wb = lat.ma_wb;
+    auto ma_ro = lat.readonly;
+    auto ma_wb = lat.writeback;
     auto all_read = std::get<0>(all_access);
     auto all_write = std::get<1>(all_access);
     double read_sample = 0.;
@@ -26,6 +26,23 @@ double CXLMemExpander::calculate_latency(LatencyPass lat) {
     if (all_write != 0) {
         write_sample = ((double)last_write / all_write);
     }
+    uint64_t mastall_wb = 0; 
+    uint64_t mastall_ro = 0; 
+    /**     If both target_llchits and target_llcmiss are 0, the access hit in L2,
+     *     so the stall caused by LLC misses is 0.
+     *     choose by vector */
+
+    //    mastall_wb = (double)(target_l2stall / frequency) *
+    //                 ((double)(weight * llcmiss_wb) / (double)(target_llchits + (weight * target_llcmiss))) * 1000;
+    //    // weight is a delay specific value current pro
+    //    mastall_ro = (double)(target_l2stall / frequency) *
+    //                 ((double)(weight * llcmiss_ro) / (double)(target_llchits + (weight * target_llcmiss))) *
+    //                 1000; // weight is a delay specific value
+    //    LOG(DEBUG) << fmt::format("l2stall={}, mastall_wb={}, mastall_ro={}, target_llchits={}, target_llcmiss={}\n",
+    //                              target_l2stall, mastall_wb, mastall_ro, target_llchits, target_llcmiss);
+
+    auto writeback = (double)mastall_wb / dramlatency;
+    auto readonly = (double)mastall_ro / dramlatency;
     this->last_latency =
         ma_ro * read_sample * (latency.read - dramlatency) + ma_wb * write_sample * (latency.write - dramlatency);
     return this->last_latency;
@@ -43,7 +60,7 @@ double CXLMemExpander::calculate_bandwidth(BandwidthPass bw) {
     if (all_read != 0) {
         read_sample = ((double)last_read / all_read);
     }
-    double write_sample = 0.;
+    double write_sample = 0.; // based on time series
     if (all_write != 0) {
         write_sample = ((double)last_write / all_write);
     }
@@ -51,13 +68,13 @@ double CXLMemExpander::calculate_bandwidth(BandwidthPass bw) {
         ((double)bandwidth.read)) {
         res +=
             read_sample * 64 * read_config / 1024 / 1024 / (this->epoch + this->last_latency) * 1000 / bandwidth.read -
-            this->epoch * 0.001;
+            this->epoch * 0.001; // TODO: read
     }
     if ((((double)write_sample * 64 * write_config) / 1024 / 1024 / (this->epoch + this->last_latency) * 1000) >
         bandwidth.write) {
         res += (((double)write_sample * 64 * write_config) / 1024 / 1024 / (this->epoch + this->last_latency) * 1000 /
                 bandwidth.write) -
-               this->epoch * 0.001;
+               this->epoch * 0.001; // TODO: wb+clflush
     }
     return res;
 }
@@ -89,6 +106,7 @@ void CXLMemExpander::delete_entry(uint64_t addr, uint64_t length) {
 }
 
 int CXLMemExpander::insert(uint64_t timestamp, uint64_t phys_addr, uint64_t virt_addr, int index) {
+
     if (index == this->id) {
         last_timestamp = last_timestamp > timestamp ? last_timestamp : timestamp; // Update the last timestamp
         // Check if the address is already in the map)
@@ -185,10 +203,11 @@ double CXLSwitch::calculate_bandwidth(BandwidthPass elem) {
     for (auto &switch_ : this->switches) {
         bw += switch_->calculate_bandwidth(elem);
     }
+    // time series
     return bw;
 }
 int CXLSwitch::insert(uint64_t timestamp, uint64_t phys_addr, uint64_t virt_addr, int index) {
-    for (auto &expander : this->expanders) {
+    for (auto &expander : this->expanders) { // differ read and write。
         auto ret = expander->insert(timestamp, phys_addr, virt_addr, index);
         if (ret == 1) {
             this->counter.inc_store();
@@ -212,6 +231,7 @@ int CXLSwitch::insert(uint64_t timestamp, uint64_t phys_addr, uint64_t virt_addr
             return 0;
         }
     }
+    return 0;
 }
 std::tuple<double, std::vector<uint64_t>> CXLSwitch::calculate_congestion() {
     double latency = 0.0;
diff --git a/src/helper.cpp b/src/helper.cpp
index 0fdb9d6..580ed43 100644
--- a/src/helper.cpp
+++ b/src/helper.cpp
@@ -2,260 +2,102 @@
 // Created by victoryang00 on 1/12/23.
 //
 #include "helper.h"
-#include "logging.h"
-
-const struct ModelContext model_ctx[] = {{CPU_MDL_BDX,
-                                          {"/sys/bus/event_source/devices/uncore_cbox_%u/type",
-                                           /*
-                                            * cbo_config:
-                                            *    unc_c_llc_victims.m_state
-                                            *    umask=0x1,event=0x37
-                                            */
-                                           0x0137,
-                                           /*
-                                            * all_dram_rds_config:
-                                            *   offcore_response.all_reads.llc_miss.local_dram
-                                            *   cpu/umask=0x1,event=0xb7,offcore_rsp=0x40007f7/
-                                            */
-                                           0x01b7, 0x6040007f7,
-                                           /*
-                                            * cpu_l2stall_config:
-                                            *   cycle_activity.stalls_l2_pending
-                                            *   cpu/umask=0x5,cmask=0x5,event=0xa3/
-                                            */
-                                           0x50005a3,
-                                           /*
-                                            * cpu_llcl_hits_config:
-                                            *   mem_load_uops_l3_hit_retired.xsnp_none
-                                            *   cpu/umask=0x8,event=0xd2/
-                                            */
-                                           0x08d2,
-                                           /*
-                                            * cpu_llcl_miss_config:
-                                            *   mem_load_uops_l3_miss_retired.local_dram
-                                            *   cpu/umask=0x1,event=0xd3/
-                                            */
-                                           0x01d3,
-                                           /*
-                                            * cpu_bandwidth_read_config:
-                                            *   UNC_M_CAS_COUNT.RD * 64
-                                            *   cpu/umask=0x03,event=0x04/
-                                            */
-                                           0x0304,
-                                           /*
-                                            * cpu_bandwidth_write_config:
-                                            *   UNC_M_CAS_COUNT.WR * 64
-                                            *   cpu/umask=0x0c,event=0x04/
-                                            */
-                                           0x0c04}},
-                                         {CPU_MDL_SKX,
-                                          {"/sys/bus/event_source/devices/uncore_cha_%u/type",
-                                           /*
-                                            * cbo_config:
-                                            *   UNC_C_LLC_VICTIMS
-                                            *   umask=0x21,event=37
-                                            */
-                                           0x2137,
-                                           /*
-                                            * all_dram_rds_config:
-                                            *   OCR.ALL_READS.L3_MISS.SNOOP_NONE
-                                            *   cpu/umask=0x1,event=0xb7,offcore_rsp=0xBC408000/
-                                            */
-                                           0x01b7, 0xBC408000,
-                                           /*
-                                            * cpu_l2stall_config:
-                                            *   cycle_activity.stalls_l2_miss
-                                            *   cpu/umask=0x5,cmask=0x5,event=0xa3/
-                                            */
-                                           0x50005a3,
-                                           /*
-                                            * cpu_llcl_hits_config:
-                                            *   mem_load_l3_hit_retired.xsnp_none
-                                            *   cpu/umask=0x8,event=0xd2/
-                                            */
-                                           0x08d2,
-                                           /*
-                                            * cpu_llcl_miss_config:
-                                            *   mem_load_l3_miss_retired.local_dram
-                                            *   cpu/umask=0x1,event=0xd3/
-                                            */
-                                           0x01d3,
-                                           /*
-                                            * cpu_bandwidth_read_config:
-                                            *   UNC_M_CAS_COUNT.RD * 64
-                                            *   cpu/umask=0x03,event=0x04/
-                                            */
-                                           0x0304,
-                                           /*
-                                            * cpu_bandwidth_write_config:
-                                            *   UNC_M_CAS_COUNT.WR * 64
-                                            *   cpu/umask=0x0c,event=0x04/
-                                            */
-                                           0x0c04}},
-                                         {CPU_MDL_SPR,
-                                          {"/sys/bus/event_source/devices/uncore_cha_%u/type",
-                                           /*
-                                            * cbo_config:
-                                            *   UNC_C_LLC_VICTIMS => OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD
-                                            *   umask=0x10,event=b0
-                                            */
-                                           0x10b0,
-                                           /*
-                                            * all_dram_rds_config:
-                                            *   OCR.ALL_READS.L3_MISS.SNOOP_NONE => L3_MISS.SNOOP_MISS_OR_NO_FWD
-                                            *   cpu/umask=0x1,event=0xb7,offcore_rsp=0x63FC00491/
-                                            */
-                                           0x01b7, 0x63FC00491,
-                                           /*
-                                            * cpu_l2stall_config:
-                                            *   cycle_activity.stalls_l2_miss
-                                            *   cpu/umask=0x5,cmask=0x5,event=0xa3/
-                                            */
-                                           0x50005a3,
-                                           /*
-                                            * cpu_llcl_hits_config:
-                                            *   mem_load_l3_hit_retired.xsnp_none
-                                            *   cpu/umask=0x8,event=0xd2/
-                                            */
-                                           0x08d2,
-                                           /*
-                                            * cpu_llcl_miss_config:
-                                            *   mem_load_l3_miss_retired.local_dram
-                                            *   cpu/umask=0x1,event=0xd3/
-                                            */
-                                           0x01d3,
-                                           /*
-                                            * cpu_bandwidth_read_config:
-                                            *   UNC_M_CAS_COUNT.RD * 64
-                                            *   cpu/umask=0xcf,event=0x05/
-                                            */
-                                           0xcf05,
-                                           /*
-                                            * cpu_bandwidth_write_config:
-                                            *   UNC_M_CAS_COUNT.WR * 64
-                                            *   cpu/umask=0xf0,event=0x05/
-                                            */
-                                           0xf005}},
-                                         {CPU_MDL_ADL,
-                                          {"/sys/bus/event_source/devices/uncore_cbox_%u/type",
-                                           /*
-                                            * cbo_config:
-                                            *   UNC_C_LLC_VICTIMS => OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD
-                                            *   umask=0x21,event=10
-                                            */
-                                           0x2110,
-                                           /*
-                                            * all_dram_rds_config:
-                                            *   OCR.ALL_READS.L3_MISS.SNOOP_NONE => OCR.DEMAND_DATA_RD.L3_MISS
-                                            *   cpu/umask=0x1,event=0x2A,offcore_rsp=0x3FBFC00001/
-                                            */
-                                           0x012a, 0x3fbfc00001,
-                                           /*
-                                            * cpu_l2stall_config:
-                                            *   cycle_activity.stalls_l2_miss
-                                            *   cpu/umask=0x5,cmask=0x5,event=0xa3/
-                                            */
-                                           0x50005a3,
-                                           /*
-                                            * cpu_llcl_hits_config:
-                                            *   mem_load_l3_hit_retired.xsnp_none
-                                            *   cpu/umask=0x8,event=0xd2/
-                                            */
-                                           0x08d2,
-                                           /*
-                                            * cpu_llcl_miss_config:
-                                            *   mem_load_l3_miss_retired.local_dram
-                                            *   cpu/umask=0x1,event=0xd3/
-                                            */
-                                           0x01d3,
-                                           /*
-                                            * cpu_bandwidth_read_config:
-                                            *   UNC_M_CAS_COUNT.RD * 64
-                                            *   cpu/umask=0xcf,event=0x05/
-                                            */
-                                           0xcf05,
-                                           /*
-                                            * cpu_bandwidth_write_config:
-                                            *   UNC_M_CAS_COUNT.WR * 64
-                                            *   cpu/umask=0xf0,event=0x05/
-                                            */
-                                           0xf005}},
-                                         {CPU_MDL_END, {0}}};
+#include <string>
+#include <vector>
+
+struct ModelContext model_ctx[] = {{CPU_MDL_BDX,
+                                    {
+                                        "/sys/bus/event_source/devices/uncore_cbo_%u/type",
+                                    }},
+                                   {CPU_MDL_SKX,
+                                    {
+                                        "/sys/bus/event_source/devices/uncore_cha_%u/type",
+                                    }},
+                                   {CPU_MDL_SPR,
+                                    {
+                                        "/sys/bus/event_source/devices/uncore_cha_%u/type",
+                                    }},
+                                   {CPU_MDL_ADL,
+                                    {
+                                        "/sys/bus/event_source/devices/uncore_cbo_%u/type",
+                                    }},
+                                   {CPU_MDL_END, {""}}};
 
 int Helper::num_of_cpu() {
-    int ncpu;
-    ncpu = sysconf(_SC_NPROCESSORS_ONLN);
+    int ncpu = sysconf(_SC_NPROCESSORS_ONLN);
     if (ncpu < 0) {
         LOG(ERROR) << "sysconf";
     }
-    LOG(DEBUG) << fmt::format("num_of_cpu={}\n", ncpu);
     return ncpu;
 }
 
-int Helper::num_of_cbo() {
-    int ncbo = 0;
-    for (; ncbo < 128; ++ncbo) {
-        std::string path = fmt::format("/sys/bus/event_source/devices/uncore_cbox_{}/type", ncbo);
-        // LOG(DEBUG) << path;
+int Helper::num_of_cha() {
+    int ncha = 0;
+    for (; ncha < cpu; ++ncha) {
+        std::string path = fmt::format("/sys/bus/event_source/devices/uncore_cha_{}/type", ncha);
+        //         LOG(DEBUG) << path;
         if (!std::filesystem::exists(path)) {
             break;
         }
     }
-    LOG(DEBUG) << fmt::format("num_of_cbo={}\n", ncbo);
-    return ncbo;
+    return ncha;
 }
 
-double Helper::cpu_frequency() const {
-    int i = 0;
-    int cpu = 0;
+double Helper::cpu_frequency() {
+    int i, c = 0;
     double cpu_mhz = 0.0;
     double max_cpu_mhz = 0.0;
     std::ifstream fp("/proc/cpuinfo");
 
-    for (std::string line; cpu != this->cpu - 1; std::getline(fp, line)) {
+    for (std::string line; c != this->num_of_cpu() - 1; std::getline(fp, line)) {
         // LOG(DEBUG) << fmt::format("line: {}\n", line);
         i = std::sscanf(line.c_str(), "cpu MHz : %lf", &cpu_mhz);
         max_cpu_mhz = i == 1 ? std::max(max_cpu_mhz, cpu_mhz) : max_cpu_mhz;
-        std::sscanf(line.c_str(), "processor : %d", &cpu);
+        std::sscanf(line.c_str(), "processor : %d", &c);
     }
     LOG(DEBUG) << fmt::format("cpu MHz: {}\n", cpu_mhz);
 
     return cpu_mhz;
 }
-PerfConfig Helper::detect_model(uint32_t model) {
+PerfConfig Helper::detect_model(uint32_t model, const std::vector<std::string> &perf_name,
+                                const std::vector<uint64_t> &perf_conf1, const std::vector<uint64_t> &perf_conf2) {
     int i = 0;
     LOG(INFO) << fmt::format("Detecting model...{}\n", model);
     while (model_ctx[i].model != CPU_MDL_END) {
         if (model_ctx[i].model == model) {
             this->perf_conf = model_ctx[i].perf_conf;
-            return model_ctx[i].perf_conf;
+            for (int j = 0; j < 4; ++j) {
+                this->perf_conf.cha[j] = std::make_tuple(perf_name[j], perf_conf1[j], perf_conf2[j]);
+            }
+            for (int j = 0; j < 4; ++j) {
+                this->perf_conf.cpu[j] = std::make_tuple(perf_name[j + 4], perf_conf1[j + 4], perf_conf2[j + 4]);
+            }
+            return this->perf_conf;
         }
         i++;
     }
-    LOG(ERROR) << "Failed to execute. This CPU model is not supported. Update src/types.c\n";
+    LOG(ERROR) << "Failed to execute. This CPU model is not supported. Refer to perfmon or pcm to add support\n";
     throw;
 }
-Helper::Helper() : perf_conf({}) {
+Helper::Helper() {
     cpu = num_of_cpu();
-    LOG(DEBUG) << cpu;
-    cbo = num_of_cbo();
-    cpu_freq = cpu_frequency();
+    cha = num_of_cha();
 }
 void Helper::noop_handler(int sig) { ; }
 void Helper::detach_children() {
-    struct sigaction sa;
+    struct sigaction sa {};
 
     sa.sa_handler = noop_handler;
     sigemptyset(&sa.sa_mask);
     sa.sa_flags = SA_RESTART | SA_NOCLDWAIT;
-    if (sigaction(SIGCHLD, &sa, NULL) < 0) {
+    if (sigaction(SIGCHLD, &sa, nullptr) < 0) {
         LOG(ERROR) << fmt::format("Failed to sigaction: %s", strerror(errno));
     }
 }
 int PMUInfo::start_all_pmcs() {
     /* enable all pmcs to count */
-    int i, r;
-    for (i = 0; i < helper->num_of_cpu(); i++) {
+    int r, i;
+    for (i = 0; i < this->cpus.size(); i++) {
         r = this->cpus[i].start();
         if (r < 0) {
             LOG(ERROR) << fmt::format("start failed. cpu:{}\n", i);
@@ -265,24 +107,19 @@ int PMUInfo::start_all_pmcs() {
     return 0;
 }
 PMUInfo::PMUInfo(pid_t pid, Helper *helper, struct PerfConfig *perf_config) : helper(helper) {
-    int i, r, n;
+    int r;
 
-    n = helper->num_of_cbo();
-
-    for (i = 0; i < n; i++) {
-        this->cbos.emplace_back(i, perf_config);
+    for (auto i : helper->used_cpu) {
+        this->chas.emplace_back(i, perf_config);
     }
-
     // unfreeze counters
-    r = this->unfreeze_counters_cbo_all();
+    r = this->unfreeze_counters_cha_all();
     if (r < 0) {
-        LOG(DEBUG) << fmt::format("unfreeze_counters_cbo_all failed.\n");
+        LOG(DEBUG) << fmt::format("unfreeze_counters_cha_all failed.\n");
         throw;
     }
 
-    n = helper->num_of_cpu();
-
-    for (i = 0; i < n; i++) {
+    for (auto i : helper->used_cpu) {
         this->cpus.emplace_back(pid, i, perf_config);
     }
 
@@ -295,7 +132,7 @@ int PMUInfo::stop_all_pmcs() {
     /* disable all pmcs to count */
     int i, r;
 
-    for (i = 0; i < helper->num_of_cpu(); i++) {
+    for (i = 0; i < this->cpus.size(); i++) {
         r = this->cpus[i].stop();
         if (r < 0) {
             LOG(ERROR) << fmt::format("stop failed. cpu:{}\n", i);
@@ -305,31 +142,36 @@ int PMUInfo::stop_all_pmcs() {
     return 0;
 }
 
-int PMUInfo::unfreeze_counters_cbo_all() {
+int PMUInfo::unfreeze_counters_cha_all() {
     int i, r;
 
-    for (i = 0; i < helper->num_of_cbo(); i++) {
-        r = this->cbos[i].perf->start();
-        if (r < 0) {
-            LOG(ERROR) << fmt::format("perf_start failed. cbo:{}\n", i);
-            return r;
+    for (i = 0; i < this->chas.size(); i++) {
+        for (int j : {0, 1, 2, 3}) {
+            r = this->chas[i].perf[j]->start();
+            if (r < 0) {
+                LOG(ERROR) << fmt::format("perf_start failed. cha:{}\n", i);
+                return r;
+            }
         }
     }
     return 0;
 }
-int PMUInfo::freeze_counters_cbo_all() {
+int PMUInfo::freeze_counters_cha_all() {
     int i, r;
 
-    for (i = 0; i < helper->num_of_cbo(); i++) {
-        r = this->cbos[i].perf->stop();
-        if (r < 0) {
-            LOG(ERROR) << fmt::format("perf_stop failed. cbo:{}\n", i);
-            return r;
+    for (i = 0; i < this->chas.size(); i++) {
+        for (int j : {0, 1, 2, 3}) {
+            r = this->chas[i].perf[j]->stop();
+            if (r < 0) {
+                LOG(ERROR) << fmt::format("perf_stop failed. cha:{}\n", i);
+                return r;
+            }
         }
     }
     return 0;
 }
 PMUInfo::~PMUInfo() {
-    this->cpus.clear();
-    this->cbos.clear();
+    stop_all_pmcs(); // must run before cpus is cleared: it iterates this->cpus
+    this->cpus.clear();
+    this->chas.clear();
 }
diff --git a/src/incore.cpp b/src/incore.cpp
index accda65..c2c39ab 100644
--- a/src/incore.cpp
+++ b/src/incore.cpp
@@ -4,7 +4,7 @@
 
 #include "incore.h"
 #include "helper.h"
-
+extern Helper helper;
 void pcm_cpuid(const unsigned leaf, CPUID_INFO *info) {
     __asm__ __volatile__("cpuid"
                          : "=a"(info->reg.eax), "=b"(info->reg.ebx), "=c"(info->reg.ecx), "=d"(info->reg.edx)
@@ -14,7 +14,7 @@ void pcm_cpuid(const unsigned leaf, CPUID_INFO *info) {
 int Incore::start() {
     int i, r = -1;
 
-    for (i = 0; i < 4; i++) {
+    for (i = 0; i < this->perf.size(); i++) {
         r = this->perf[i]->start();
         if (r < 0) {
             LOG(ERROR) << fmt::format("perf_start failed. i:{}\n", i);
@@ -26,7 +26,7 @@ int Incore::start() {
 int Incore::stop() {
     int i, r = -1;
 
-    for (i = 0; i < 4; i++) {
+    for (i = 0; i < this->perf.size(); i++) {
         r = this->perf[i]->stop();
         if (r < 0) {
             LOG(ERROR) << fmt::format("perf_stop failed. i:{}\n", i);
@@ -35,95 +35,35 @@ int Incore::stop() {
     }
     return r;
 }
-void Incore::init_all_dram_rds(const pid_t pid, const int cpu) {
-    this->perf[0] = init_incore_perf(pid, cpu, perf_config->all_dram_rds_config, perf_config->all_dram_rds_config1);
-}
-void Incore::init_cpu_mem_read(const pid_t pid, const int cpu) {
-    this->perf[0] = init_incore_perf(pid, cpu, perf_config->cpu_bandwidth_read_config, 0);
-}
-void Incore::init_cpu_l2stall(const pid_t pid, const int cpu) {
-    this->perf[1] = init_incore_perf(pid, cpu, perf_config->cpu_l2stall_config, 0);
-}
-void Incore::init_cpu_llcl_hits(const pid_t pid, const int cpu) {
-    this->perf[2] = init_incore_perf(pid, cpu, perf_config->cpu_llcl_hits_config, 0);
-}
-void Incore::init_cpu_llcl_miss(const pid_t pid, const int cpu) {
-    this->perf[3] = init_incore_perf(pid, cpu, perf_config->cpu_llcl_miss_config, 0);
-}
-void Incore::init_cpu_mem_write(const pid_t pid, const int cpu) {
-    this->perf[5] = init_incore_perf(pid, cpu, perf_config->cpu_bandwidth_write_config, 0);
-}
-void Incore::init_cpu_ebpf(const pid_t pid, const int cpu) {
-    if (cpu == 0)
-        this->perf[4] = init_incore_bpf_perf(pid, cpu);
-    else
-        this->perf[4] = nullptr;
-}
-int Incore::read_cpu_elems(struct CPUElem *elem) {
-    ssize_t r;
-
-    r = this->perf[0]->read_pmu(&elem->cpu_bandwidth_read);
-    if (r < 0) {
-        LOG(ERROR) << fmt::format("read cpu_bandwidth_read failed.\n");
-        return r;
-    }
-    LOG(DEBUG) << fmt::format("read cpu_bandwidth_read:{}\n", elem->cpu_bandwidth_read);
 
-    r = this->perf[1]->read_pmu(&elem->cpu_l2stall_t);
-    if (r < 0) {
-        LOG(ERROR) << fmt::format("read cpu_l2stall_t failled.\n");
-        return r;
-    }
-    LOG(DEBUG) << fmt::format("read cpu_l2stall_t:{}\n", elem->cpu_l2stall_t);
-
-    r = this->perf[2]->read_pmu(&elem->cpu_llcl_hits);
-    if (r < 0) {
-        LOG(ERROR) << fmt::format("read cpu_llcl_hits failed.\n");
-        return r;
+ssize_t Incore::read_cpu_elems(struct CPUElem *elem) {
+    ssize_t r;
+    for (auto const &[idx, value] : this->perf | enumerate) { // perf[idx] is built from perf_config->cpu[idx]
+        r = value->read_pmu(&elem->cpu[idx]);
+        if (r < 0) {
+            LOG(ERROR) << fmt::format("read cpu_elems[{}] failed.\n", std::get<0>(helper.perf_conf.cpu[idx]));
+            return r;
+        }
+        LOG(DEBUG) << fmt::format("read cpu_elems[{}]:{}\n", std::get<0>(helper.perf_conf.cpu[idx]), elem->cpu[idx]);
     }
-    LOG(DEBUG) << fmt::format("read cpu_llcl_hits:{}\n", elem->cpu_llcl_hits);
 
-    r = this->perf[3]->read_pmu(&elem->cpu_llcl_miss);
-    if (r < 0) {
-        LOG(ERROR) << fmt::format("read cpu_llcl_miss failed.\n");
-        return r;
-    }
-    LOG(DEBUG) << fmt::format("read cpu_llcl_miss:{}\n", elem->cpu_llcl_miss);
-
-    // r = this->perf[4]->read_pmu(&elem->cpu_bandwidth_read);
-    // if (r < 0) {
-    //     LOG(ERROR) << fmt::format("read cpu_bandwidth_read failed.\n");
-    //     return r;
-    // }
-    // LOG(DEBUG) << fmt::format("read cpu_bandwidth_read:{}\n", elem->cpu_bandwidth_read);
-    // r = this->perf[5]->read_pmu(&elem->cpu_bandwidth_write);
-    // if (r < 0) {
-    //     LOG(ERROR) << fmt::format("read cpu_bandwidth_write failed.\n");
-    //     return r;
-    // }
-    // LOG(DEBUG) << fmt::format("read cpu_bandwidth_write:{}\n", elem->cpu_bandwidth_write);
-    if (this->perf[4] != nullptr) {
-        elem->cpu_munmap_address_length = this->perf[4]->read_trace_pipe();
-        LOG(DEBUG) << "read munmap result with size:" << elem->cpu_munmap_address_length.size() << "\n";
-    }
+    return 0;
 }
+
 Incore::Incore(const pid_t pid, const int cpu, struct PerfConfig *perf_config) : perf_config(perf_config) {
     /* reset all pmc values */
-    // this->init_all_dram_rds(pid, cpu);
-    this->init_cpu_mem_read(pid, cpu);
-    this->init_cpu_l2stall(pid, cpu);
-    this->init_cpu_llcl_hits(pid, cpu);
-    this->init_cpu_llcl_miss(pid, cpu);
-    this->init_cpu_ebpf(pid, cpu);
-    // this->init_cpu_mem_write(pid, cpu);
+    for (int i = 0; i < perf_config->cpu.size(); i++) {
+        this->perf[i] = init_incore_perf(pid, cpu, std::get<1>(perf_config->cpu[i]), std::get<2>(perf_config->cpu[i]));
+    }
 }
+
 bool get_cpu_info(struct CPUInfo *cpu_info) {
     char buffer[1024];
     union {
         char cbuf[16];
         int ibuf[16 / sizeof(int)];
-    } buf;
-    CPUID_INFO cpuinfo;
+    } buf{};
+    CPUID_INFO cpuinfo{};
 
     pcm_cpuid(0, &cpuinfo);
 
@@ -133,7 +73,7 @@ bool get_cpu_info(struct CPUInfo *cpu_info) {
     buf.ibuf[1] = cpuinfo.array[3];
     buf.ibuf[2] = cpuinfo.array[2];
 
-    if (strncmp(buf.cbuf, "GenuineIntel", 4 * 3) != 0) {
+    if (strncmp(buf.cbuf, "GenuineIntel", 12) != 0) {
         LOG(ERROR) << fmt::format("We only Support Intel CPU\n");
         return false;
     }
diff --git a/src/logging.cpp b/src/logging.cpp
index 3ae18ec..83bc062 100644
--- a/src/logging.cpp
+++ b/src/logging.cpp
@@ -3,11 +3,17 @@
 //
 
 #include "logging.h"
+#include <cstddef>
+#include <iostream>
 
 void LogWriter::operator<(const LogStream &stream) {
     std::ostringstream msg;
-    msg << stream.sstream_->rdbuf();
-    output_log(msg);
+    if (log_level_ == TRACE)
+        file_ << stream.sstream_->rdbuf();
+    else {
+        msg << stream.sstream_->rdbuf();
+        output_log(msg);
+    }
 }
 
 void LogWriter::output_log(const std::ostringstream &msg) {
diff --git a/src/main.cpp b/src/main.cc
similarity index 56%
rename from src/main.cpp
rename to src/main.cc
index 3c38dde..59f2efa 100644
--- a/src/main.cpp
+++ b/src/main.cc
@@ -1,50 +1,56 @@
 //
 // Created by victoryang00 on 1/12/23.
 //
+
 #include "cxlendpoint.h"
 #include "helper.h"
-#include "logging.h"
 #include "monitor.h"
 #include "policy.h"
+#include "sock.h"
 #include <cerrno>
 #include <cmath>
-#include <cstdint>
 #include <cstdio>
 #include <cstdlib>
 #include <ctime>
 #include <cxxopts.hpp>
-#include <range/v3/view.hpp>
 #include <sys/poll.h>
 #include <sys/socket.h>
 #include <sys/un.h>
 #include <unistd.h>
 
-#define SOCKET_PATH "/tmp/cxl_mem_simulator.sock"
-
+Helper helper{};
 int main(int argc, char *argv[]) {
-
-    cxxopts::Options options("CXL-MEM-Simulator",
-                             "For simulation of CXL.mem Type 3 on Broadwell, Skylake, and Saphire Rapids");
+    cxxopts::Options options("CXLMemSim", "For simulation of CXL.mem Type 3 on Sapphire Rapids");
     options.add_options()("t,target", "The script file to execute",
-                          cxxopts::value<std::string>()->default_value("./microbench/many_calloc"))(
-        "h,help", "The value for epoch value", cxxopts::value<bool>()->default_value("false"))(
-        "i,interval", "The value for epoch value", cxxopts::value<int>()->default_value("5"))(
-        "c,cpuset", "The CPUSET for CPU to set affinity on",
-        cxxopts::value<std::vector<int>>()->default_value("0,1,2,3,4,5,6,7,8,9,10,11,12,13,14"))(
-        "d,dramlatency", "The current platform's dram latency", cxxopts::value<double>()->default_value("85"))(
-        "p,pebsperiod", "The pebs sample period", cxxopts::value<int>()->default_value("1"))(
+                          cxxopts::value<std::string>()->default_value("./microbench/ld_simple"))(
+        "h,help", "Help for CXLMemSim", cxxopts::value<bool>()->default_value("false"))(
+        "i,interval", "The value for epoch value", cxxopts::value<int>()->default_value("1000"))(
+        "s,source", "Collection Phase or Validation Phase", cxxopts::value<bool>()->default_value("false"))(
+        "c,cpuset", "The CPUSET for CPU to set affinity on and only run the target process on those CPUs",
+        cxxopts::value<std::vector<int>>()->default_value("0"))("d,dramlatency", "The current platform's dram latency",
+                                                                cxxopts::value<double>()->default_value("110"))(
+        "p,pebsperiod", "The pebs sample period", cxxopts::value<int>()->default_value("100"))(
         "m,mode", "Page mode or cacheline mode", cxxopts::value<std::string>()->default_value("p"))(
         "o,topology", "The newick tree input for the CXL memory expander topology",
         cxxopts::value<std::string>()->default_value("(1,(2,3))"))(
-        "s,capacity", "The capacity vector of the CXL memory expander with the firsgt local",
+        "e,capacity", "The capacity vector of the CXL memory expander with the firsgt local",
         cxxopts::value<std::vector<int>>()->default_value("0,20,20,20"))(
         "f,frequency", "The frequency for the running thread", cxxopts::value<double>()->default_value("4000"))(
         "l,latency", "The simulated latency by epoch based calculation for injected latency",
         cxxopts::value<std::vector<int>>()->default_value("100,150,100,150,100,150"))(
-        "w,weight", "The simulated weight for multiplying with the LLC miss",
-        cxxopts::value<double>()->default_value("4.1"))(
         "b,bandwidth", "The simulated bandwidth by linear regression",
-        cxxopts::value<std::vector<int>>()->default_value("50,50,50,50,50,50"));
+        cxxopts::value<std::vector<int>>()->default_value("50,50,50,50,50,50"))(
+        "x,pmu_name", "The input for Collected PMU",
+        cxxopts::value<std::vector<std::string>>()->default_value(
+            "tatal_stall,all_dram_rds,l2stall,snoop_fw_wb,llcl_hits,llcl_miss,null,null"))(
+        "y,pmu_config1", "The config0 for Collected PMU",
+        cxxopts::value<std::vector<uint64_t>>()->default_value("0x04004a3,0x01b7,0x05005a3,0x205c,0x08d2,0x01d3,0,0"))(
+        "z,pmu_config2", "The config1 for Collected PMU",
+        cxxopts::value<std::vector<uint64_t>>()->default_value("0,0x63FC00491,0,0,0,0,0,0"))(
+        "w,weight", "The weight for Linear Regression",
+        cxxopts::value<std::vector<double>>()->default_value("88, 88, 88, 88, 88, 88, 88"))(
+        "v,weight_vec", "The weight vector for Linear Regression",
+        cxxopts::value<std::vector<double>>()->default_value("400, 800, 1200, 1600, 2000, 2400, 3000"));
 
     auto result = options.parse(argc, argv);
     if (result["help"].as<bool>()) {
@@ -56,31 +62,51 @@ int main(int argc, char *argv[]) {
     auto cpuset = result["cpuset"].as<std::vector<int>>();
     auto pebsperiod = result["pebsperiod"].as<int>();
     auto latency = result["latency"].as<std::vector<int>>();
-    auto weight = result["weight"].as<double>();
     auto bandwidth = result["bandwidth"].as<std::vector<int>>();
     auto frequency = result["frequency"].as<double>();
     auto topology = result["topology"].as<std::string>();
     auto capacity = result["capacity"].as<std::vector<int>>();
     auto dramlatency = result["dramlatency"].as<double>();
-    auto mode = result["mode"].as<std::string>() == "p" ? true : false;
-    Helper helper{};
-    InterleavePolicy *policy = new InterleavePolicy();
+    auto pmu_name = result["pmu_name"].as<std::vector<std::string>>();
+    auto pmu_config1 = result["pmu_config1"].as<std::vector<uint64_t>>();
+    auto pmu_config2 = result["pmu_config2"].as<std::vector<uint64_t>>();
+    auto weight = result["weight"].as<std::vector<double>>();
+    auto weight_vec = result["weight_vec"].as<std::vector<double>>();
+    auto source = result["source"].as<bool>();
+    enum page_type mode;
+    if (result["mode"].as<std::string>() == "hugepage_2M") {
+        mode = page_type::HUGEPAGE_2M;
+    } else if (result["mode"].as<std::string>() == "hugepage_1G") {
+        mode = page_type::HUGEPAGE_1G;
+    } else if (result["mode"].as<std::string>() == "cacheline") {
+        mode = page_type::CACHELINE;
+    } else {
+        mode = page_type::PAGE;
+    }
+
+    auto *policy = new InterleavePolicy();
     CXLController *controller;
+
     uint64_t use_cpus = 0;
     cpu_set_t use_cpuset;
     CPU_ZERO(&use_cpuset);
-    for (int i = 0; i < helper.cpu; i++) {
+    for (auto i : cpuset) {
         if (!use_cpus || use_cpus & 1UL << i) {
             CPU_SET(i, &use_cpuset);
-            LOG(DEBUG) << fmt::format("use cpuid: {}{}\n", i, use_cpus); /** TODO: set CAT here */
+            LOG(DEBUG) << fmt::format("use cpuid: {}{}\n", i, use_cpus);
         }
     }
+
     auto tnum = CPU_COUNT(&use_cpuset);
     auto cur_processes = 0;
-    auto ncpu = helper.cpu;
-    auto ncbo = helper.cbo;
-    LOG(DEBUG) << fmt::format("tnum:{}, intrval:{}, weight:{}\n", tnum, interval, weight);
-    for (auto const &[idx, value] : capacity | ranges::views::enumerate) {
+    auto ncpu = helper.num_of_cpu();
+    auto ncha = helper.num_of_cha();
+    LOG(DEBUG) << fmt::format("tnum:{}, intrval:{}\n", tnum, interval);
+    for (auto const &[idx, value] : weight | enumerate) {
+        LOG(DEBUG) << fmt::format("weight[{}]:{}\n", weight_vec[idx], value);
+    }
+
+    for (auto const &[idx, value] : capacity | enumerate) {
         if (idx == 0) {
             LOG(DEBUG) << fmt::format("local_memory_region capacity:{}\n", value);
             controller = new CXLController(policy, capacity[0], mode, interval);
@@ -101,67 +127,62 @@ int main(int argc, char *argv[]) {
     int sock;
     struct sockaddr_un addr {};
 
+    /** Hove been got by socket if it's not main thread and synchro */
     sock = socket(AF_UNIX, SOCK_DGRAM, 0);
     addr.sun_family = AF_UNIX;
     strcpy(addr.sun_path, SOCKET_PATH);
     remove(addr.sun_path);
-    if (bind(sock, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
+    if (bind(sock, (struct sockaddr *)&addr, sizeof(addr)) == -1) { // can be blocked for multi thread
         LOG(ERROR) << "Failed to execute. Can't bind to a socket.\n";
         exit(1);
     }
+
+    size_t sock_buf_size = sizeof(op_data) + 1;
+    char *sock_buf = (char *)malloc(sock_buf_size);
+
     LOG(DEBUG) << fmt::format("cpu_freq:{}\n", frequency);
-    LOG(DEBUG) << fmt::format("num_of_cbo:{}\n", ncbo);
+    LOG(DEBUG) << fmt::format("num_of_cha:{}\n", ncha);
     LOG(DEBUG) << fmt::format("num_of_cpu:{}\n", ncpu);
-    Monitors monitors{tnum, &use_cpuset, static_cast<int>(capacity.size()) - 1, helper};
+    for (auto j : cpuset) { // j is the CPU id itself, not an index into cpuset
+        helper.used_cpu.push_back(j);
+        helper.used_cha.push_back(j);
+    }
+    Monitors monitors{tnum, &use_cpuset};
 
-    // https://stackoverflow.com/questions/24796266/tokenizing-a-string-to-pass-as-char-into-execve
+    /** Reinterpret the input for the argv argc */
     char cmd_buf[1024] = {0};
     strncpy(cmd_buf, target.c_str(), sizeof(cmd_buf));
-
-    /* This strtok_r() call puts '\0' after the first token in the buffer,
-     * It saves the state to the strtok_state and subsequent calls resume from that point. */
     char *strtok_state = nullptr;
     char *filename = strtok_r(cmd_buf, " ", &strtok_state);
-
-    /* Allocate an array of pointers.
-     * We will make them point to certain locations inside the cmd_buf. */
     char *args[32] = {nullptr};
     args[0] = filename;
-    /* loop the strtok_r() call while there are tokens and free space in the array */
     size_t current_arg_idx;
     for (current_arg_idx = 1; current_arg_idx < 32; ++current_arg_idx) {
-        /* Note that the first argument to strtok_r() is nullptr.
-         * That means resume from a point saved in the strtok_state. */
         char *current_arg = strtok_r(nullptr, " ", &strtok_state);
         if (current_arg == nullptr) {
             break;
         }
-
         args[current_arg_idx] = current_arg;
         LOG(INFO) << fmt::format("args[{}] = {}\n", current_arg_idx, args[current_arg_idx]);
     }
 
-    /* zombie avoid */
+    /** Create target process */
     Helper::detach_children();
-    /* create target process */
     auto t_process = fork();
     if (t_process < 0) {
         LOG(ERROR) << "Fork: failed to create target process";
         exit(1);
     } else if (t_process == 0) {
+        execv(filename, args); // taskset in place
-        /* We do not need to check the return value */
+        execv(filename, args); // taskset in lpace
         LOG(ERROR) << "Exec: failed to create target process\n";
         exit(1);
     }
-    /** TODO: bind the rest of core in 0-7 and affine the CXL Simulator to 8 */
-    // In case of process, use SIGSTOP.
-    auto res = monitors.enable(t_process, t_process, true, pebsperiod, tnum, mode);
+    /** In case of process, use SIGSTOP. */
+    auto res = monitors.enable(t_process, t_process, true, pebsperiod, tnum);
     if (res == -1) {
         LOG(ERROR) << fmt::format("Failed to enable monitor\n");
         exit(0);
     } else if (res < 0) {
-        // pid not found. might be already terminated.
         LOG(DEBUG) << fmt::format("pid({}) not found. might be already terminated.\n", t_process);
     }
     cur_processes++;
@@ -174,53 +195,117 @@ int main(int argc, char *argv[]) {
         exit(0);
     }
 
-    // Wait all the target processes until emulation process initialized.
+    /** Wait all the target processes until emulation process initialized. */
     monitors.stop_all(cur_processes);
 
-    /* get CPU information */
+    /** Get CPU information */
     if (!get_cpu_info(&monitors.mon[0].before->cpuinfo)) {
         LOG(DEBUG) << "Failed to obtain CPU information.\n";
     }
-
-    /* check the CPU model */
-    auto perf_config = helper.detect_model(monitors.mon[0].before->cpuinfo.cpu_model);
-
+    auto perf_config =
+        helper.detect_model(monitors.mon[0].before->cpuinfo.cpu_model, pmu_name, pmu_config1, pmu_config2);
     PMUInfo pmu{t_process, &helper, &perf_config};
 
-    /* Caculate epoch time */
+    /** Calculate epoch time */
     struct timespec waittime {};
     waittime.tv_sec = interval / 1000;
     waittime.tv_nsec = (interval % 1000) * 1000000;
 
     LOG(DEBUG) << "The target process starts running.\n";
     LOG(DEBUG) << fmt::format("set nano sec = {}\n", waittime.tv_nsec);
+    LOG(TRACE) << fmt::format("{}\n", monitors);
+    monitors.print_flag = false;
 
-    /* read CBo params */
-    for (auto mon : monitors.mon) {
-        for (auto const &[idx, value] : pmu.cbos | ranges::views::enumerate) {
-            pmu.cbos[idx].read_cbo_elems(&mon.before->cbos[idx]);
+    /* read CHA params */
+    for (const auto &mon : monitors.mon) {
+        for (auto const &[idx, value] : pmu.chas | enumerate) {
+            pmu.chas[idx].read_cha_elems(&mon.before->chas[idx]);
         }
-        for (auto const &[idx, value] : pmu.cpus | ranges::views::enumerate) {
+        for (auto const &[idx, value] : pmu.cpus | enumerate) {
             pmu.cpus[idx].read_cpu_elems(&mon.before->cpus[idx]);
         }
     }
 
     uint32_t diff_nsec = 0;
-    struct timespec start_ts, end_ts;
-    struct timespec sleep_start_ts, sleep_end_ts;
+    struct timespec start_ts {
+    }, end_ts{};
+    struct timespec sleep_start_ts {
+    }, sleep_end_ts{};
 
-    // Wait all the target processes until emulation process initialized.
+    /** Wait all the target processes until emulation process initialized. */
     monitors.run_all(cur_processes);
     for (int i = 0; i < cur_processes; i++) {
         clock_gettime(CLOCK_MONOTONIC, &monitors.mon[i].start_exec_ts);
     }
 
     while (true) {
+        /** Get from the CXLMemSimHook */
+        int n;
+        do {
+            memset(sock_buf, 0, sock_buf_size);
+            // without blocking
+            n = recv(sock, sock_buf, sock_buf_size, MSG_DONTWAIT);
+            if (n < 1) {
+                if (errno == EAGAIN || errno == EWOULDBLOCK) {
+                    // no data
+                    break;
+                } else {
+                    LOG(ERROR) << "Failed to recv";
+                    exit(-1);
+                }
+            } else if (n >= sizeof(struct op_data) && n <= sock_buf_size - 1) {
+                auto *opd = (struct op_data *)sock_buf;
+                LOG(ERROR) << fmt::format("received data: size={}, tgid={}, tid={}, opcode={}\n", n, opd->tgid,
+                                          opd->tid, opd->opcode);
+
+                if (opd->opcode == CXLMEMSIM_THREAD_CREATE || opd->opcode == CXLMEMSIM_PROCESS_CREATE) {
+                    int t;
+                    bool is_process = opd->opcode == CXLMEMSIM_PROCESS_CREATE;
+                    // register to monitor
+
+                    t = monitors.enable(opd->tgid, opd->tid, is_process, pebsperiod, tnum);
+                    if (t < 0) {
+                        // -1: enable failed; other negatives: tid not found, might be already terminated.
+                        if (t == -1)
+                            LOG(ERROR) << "Failed to enable monitor\n";
+                        continue; // never index monitors.mon with a negative slot
+                    }
+                    auto &mon = monitors.mon[t]; // reference, so start_exec_ts below lands in the stored monitor
+                    // Wait the t processes until emulation process initialized.
+                    mon.stop();
+                    /* read CHA and CPU params */
+                    for (auto const &[idx, value] : pmu.chas | enumerate) {
+                        pmu.chas[idx].read_cha_elems(&mon.before->chas[idx]);
+                    }
+                    for (auto const &[idx, value] : pmu.cpus | enumerate) {
+                        pmu.cpus[idx].read_cpu_elems(&mon.before->cpus[idx]);
+                    }
+                    // Run the t processes.
+                    mon.run();
+                    clock_gettime(CLOCK_MONOTONIC, &mon.start_exec_ts);
+                } else if (opd->opcode == CXLMEMSIM_THREAD_EXIT) {
+                    // unregister from monitor, and display results.
+                    // get the tid from the tgid
+                    auto mon = monitors.get_mon(opd->tgid, opd->tid);
+                    mon.stop();
+                } else if (opd->opcode == CXLMEMSIM_STABLE_SIGNAL) {
+                    for (auto const &[i, mon] : monitors.mon | enumerate) {
+                        if (mon.status == MONITOR_ON) {
+                            mon.stop();
+                            mon.status = MONITOR_SUSPEND;
+                        }
+                    }
+                }
+
+            } else {
+                LOG(ERROR) << fmt::format("received data is invalid size: size={}", n);
+            }
+        } while (n > 0); // check the next message.
+
         /* wait for pre-defined interval */
         clock_gettime(CLOCK_MONOTONIC, &sleep_start_ts);
 
         /** Here was a definition for the multi process and thread to enable multiple monitor */
-
         struct timespec req = waittime;
         struct timespec rem = {0};
         while (true) {
@@ -242,53 +327,61 @@ int main(int argc, char *argv[]) {
                 }
             }
         }
-        clock_gettime(CLOCK_MONOTONIC, &sleep_end_ts);
 
-        for (auto const &[i, mon] : monitors.mon | ranges::views::enumerate) {
+        uint64_t calibrated_delay;
+        for (auto const &[i, mon] : monitors.mon | enumerate) {
+            // check other process
             if (mon.status == MONITOR_DISABLE) {
                 continue;
             }
-            if (mon.status == MONITOR_ON) {
+            if (mon.status == MONITOR_ON || mon.status == MONITOR_SUSPEND) {
                 clock_gettime(CLOCK_MONOTONIC, &start_ts);
                 LOG(DEBUG) << fmt::format("[{}:{}:{}] start_ts: {}.{}\n", i, mon.tgid, mon.tid, start_ts.tv_sec,
                                           start_ts.tv_nsec);
                 mon.stop();
-                /* read CBo values */
+                /** Read CHA values */
                 uint64_t wb_cnt = 0;
-                for (int j = 0; j < ncbo; j++) {
-                    pmu.cbos[j].read_cbo_elems(&mon.after->cbos[j]);
-                    wb_cnt += mon.after->cbos[j].llc_wb - mon.before->cbos[j].llc_wb;
+                std::vector<uint64_t> cha_vec, cpu_vec{};
+                // for (int j = 0; j < ncha; j++) {
+                //     pmu.chas[j].read_cha_elems(&mon.after->chas[j]);
+                //     wb_cnt += mon.after->chas[j].cpu_llc_wb - mon.before->chas[j].cpu_llc_wb;
+                // }
+                // LOG(INFO) << fmt::format("[{}:{}:{}] LLC_WB = {}\n", i, mon.tgid, mon.tid, wb_cnt);
+                // }
+                for (int j = 0; j < helper.used_cha.size(); j++) {
+                    for (auto const &[idx, value] : pmu.chas | enumerate) {
+                        value.read_cha_elems(&mon.after->chas[j]);
+                        cha_vec.emplace_back(mon.after->chas[j].cha[idx] - mon.before->chas[j].cha[idx]);
+                    }
                 }
-                LOG(INFO) << fmt::format("[{}:{}:{}] LLC_WB = {}\n", i, mon.tgid, mon.tid, wb_cnt);
-
-                /* read CPU params */
+                /*** read CPU params */
                 uint64_t read_config = 0;
                 uint64_t target_l2stall = 0, target_llcmiss = 0, target_llchits = 0;
-                for (int j = 0; j < ncpu; ++j) {
-                    pmu.cpus[j].read_cpu_elems(&mon.after->cpus[j]);
-                    if (pmu.cpus[j].perf[4] != nullptr) {
-                        for (auto &i : mon.after->cpus[j].cpu_munmap_address_length) { // delete by ebpf
-                            LOG(DEBUG) << fmt::format("munmap address:{}, length:{}\n", i.first, i.second);
-                            controller->delete_entry(i.first, i.second);
-                        }
-                    }
-                    read_config += mon.after->cpus[j].cpu_bandwidth_read - mon.before->cpus[j].cpu_bandwidth_read;
-                }
+                // for (int j = 0; j < ncpu; ++j) {
+                //     pmu.cpus[j].read_cpu_elems(&mon.after->cpus[j]);
+                //     read_config += mon.after->cpus[j].cpu_bandwidth - mon.before->cpus[j].cpu_bandwidth;
+                // }
                 /* read PEBS sample */
                 if (mon.pebs_ctx->read(controller, &mon.after->pebs) < 0) {
                     LOG(ERROR) << fmt::format("[{}:{}:{}] Warning: Failed PEBS read\n", i, mon.tgid, mon.tid);
                 }
-                target_llcmiss = mon.after->pebs.total - mon.before->pebs.total;
+                // target_llcmiss = mon.after->pebs.total - mon.before->pebs.total;
 
                 // target_l2stall =
                 //     mon.after->cpus[mon.cpu_core].cpu_l2stall_t - mon.before->cpus[mon.cpu_core].cpu_l2stall_t;
                 // target_llchits =
                 //     mon.after->cpus[mon.cpu_core].cpu_llcl_hits - mon.before->cpus[mon.cpu_core].cpu_llcl_hits;
-                for (auto const &[idx, value] : pmu.cpus | ranges::views::enumerate) {
-                    target_l2stall += mon.after->cpus[idx].cpu_l2stall_t - mon.before->cpus[idx].cpu_l2stall_t;
-                    target_llchits += mon.after->cpus[idx].cpu_llcl_hits - mon.before->cpus[idx].cpu_llcl_hits;
+                //  for (auto const &[idx, value] : pmu.cpus | enumerate) {
+                //      target_l2stall += mon.after->cpus[idx].cpu_l2stall_t - mon.before->cpus[idx].cpu_l2stall_t;
+                //      target_llchits += mon.after->cpus[idx].cpu_llcl_hits - mon.before->cpus[idx].cpu_llcl_hits;
+                //  }
+                for (int j = 0; j < helper.used_cpu.size(); j++) {
+                    for (auto const &[idx, value] : pmu.cpus | enumerate) {
+                        value.read_cpu_elems(&mon.after->cpus[j]);
+                        //                        wb_cnt = mon.after->cpus[j].cpu[idx] - mon.before->cpus[j].cpu[idx];
+                        cpu_vec.emplace_back(mon.after->cpus[j].cpu[idx] - mon.before->cpus[j].cpu[idx]);
+                    }
                 }
-
                 uint64_t llcmiss_wb = 0;
                 // To estimate the number of the writeback-involving LLC
                 // misses of the CPU core (llcmiss_wb), the total number of
@@ -297,10 +390,10 @@ int main(int argc, char *argv[]) {
                 // the LLC misses of the CPU core (target_llcmiss) to that
                 // of the LLC misses of all the CPU cores and the
                 // prefetchers (cpus_dram_rds).
-                llcmiss_wb = wb_cnt * ((double)target_llcmiss / read_config);
-
+                // llcmiss_wb = wb_cnt * std::lround(((double)target_llcmiss) / ((double)read_config));
+                // TODO Calculate through the vector !!! target latency
                 uint64_t llcmiss_ro = 0;
-                if (target_llcmiss < llcmiss_wb) {
+                if (target_llcmiss < llcmiss_wb) { // tunning
                     LOG(ERROR) << fmt::format("[{}:{}:{}] cpus_dram_rds {}, llcmiss_wb {}, target_llcmiss {}\n", i,
                                               mon.tgid, mon.tid, read_config, llcmiss_wb, target_llcmiss);
                     llcmiss_wb = target_llcmiss;
@@ -311,23 +404,6 @@ int main(int argc, char *argv[]) {
                 LOG(DEBUG) << fmt::format("[{}:{}:{}]llcmiss_wb={}, llcmiss_ro={}\n", i, mon.tgid, mon.tid, llcmiss_wb,
                                           llcmiss_ro);
 
-                uint64_t mastall_wb = 0;
-                uint64_t mastall_ro = 0;
-                // If both target_llchits and target_llcmiss are 0, it means that hit in L2.
-                // Stall by LLC misses is 0.
-                mastall_wb = (double)(target_l2stall / frequency) *
-                             ((double)(weight * llcmiss_wb) / (double)(target_llchits + (weight * target_llcmiss))) *
-                             1000;
-                mastall_ro = (double)(target_l2stall / frequency) *
-                             ((double)(weight * llcmiss_ro) / (double)(target_llchits + (weight * target_llcmiss))) *
-                             1000;
-                LOG(DEBUG) << fmt::format(
-                    "l2stall={}, mastall_wb={}, mastall_ro={}, target_llchits={}, target_llcmiss={}, weight={}\n",
-                    target_l2stall, mastall_wb, mastall_ro, target_llchits, target_llcmiss, weight);
-
-                auto ma_wb = (double)mastall_wb / dramlatency;
-                auto ma_ro = (double)mastall_ro / dramlatency;
-
                 uint64_t emul_delay = 0;
 
                 LOG(DEBUG) << fmt::format("[{}:{}:{}] pebs: total={}, \n", i, mon.tgid, mon.tid, mon.after->pebs.total);
@@ -338,65 +414,53 @@ int main(int argc, char *argv[]) {
                 LatencyPass lat_pass = {
                     .all_access = all_access,
                     .dramlatency = dramlatency,
-                    .ma_ro = ma_ro,
-                    .ma_wb = ma_wb,
+                    .readonly = llcmiss_ro,
+                    .writeback = llcmiss_wb,
                 };
                 BandwidthPass bw_pass = {
                     .all_access = all_access,
                     .read_config = read_config,
                     .write_config = read_config,
                 };
-                emul_delay += controller->calculate_latency(lat_pass);
+                emul_delay += std::lround(controller->calculate_latency(lat_pass));
                 emul_delay += controller->calculate_bandwidth(bw_pass);
                 emul_delay += std::get<0>(controller->calculate_congestion());
 
                 mon.before->pebs.total = mon.after->pebs.total;
 
-                LOG(DEBUG) << fmt::format("ma_wb={}, ma_ro={}, delay={}\n", ma_wb, ma_ro, emul_delay);
+                LOG(DEBUG) << fmt::format("delay={}\n", emul_delay);
 
                 /* compensation of delay END(1) */
                 clock_gettime(CLOCK_MONOTONIC, &end_ts);
                 diff_nsec += (end_ts.tv_sec - start_ts.tv_sec) * 1000000000 + (end_ts.tv_nsec - start_ts.tv_nsec);
                 LOG(DEBUG) << fmt::format("dif:{}\n", diff_nsec);
 
-                uint64_t calibrated_delay = (diff_nsec > emul_delay) ? 0 : emul_delay - diff_nsec;
-                // uint64_t calibrated_delay = emul_delay;
+                calibrated_delay = (diff_nsec > emul_delay) ? 0 : emul_delay - diff_nsec;
                 mon.total_delay += (double)calibrated_delay / 1000000000;
                 diff_nsec = 0;
 
                 /* insert emulated NVM latency */
-                mon.injected_delay.tv_sec += (calibrated_delay / 1000000000);
-                mon.injected_delay.tv_nsec += (calibrated_delay % 1000000000);
+                mon.injected_delay.tv_sec += std::lround(calibrated_delay / 1000000000);
+                mon.injected_delay.tv_nsec += std::lround(calibrated_delay % 1000000000);
                 LOG(DEBUG) << fmt::format("[{}:{}:{}]delay:{} , total delay:{}\n", i, mon.tgid, mon.tid,
                                           calibrated_delay, mon.total_delay);
-                auto swap = mon.before;
-                mon.before = mon.after;
-                mon.after = swap;
 
-                /* continue suspended processes: send SIGCONT */
-                // unfreeze_counters_cbo_all(fds.msr[0]);
-                // start_pmc(&fds, i);
-                if (calibrated_delay == 0) {
-                    mon.clear_time(&mon.wasted_delay);
-                    mon.clear_time(&mon.injected_delay);
-                    mon.run();
-                }
             } else if (mon.status == MONITOR_OFF) {
                 // Wasted epoch time
                 clock_gettime(CLOCK_MONOTONIC, &start_ts);
                 uint64_t sleep_diff = (sleep_end_ts.tv_sec - sleep_start_ts.tv_sec) * 1000000000 +
                                       (sleep_end_ts.tv_nsec - sleep_start_ts.tv_nsec);
-                struct timespec sleep_time;
-                sleep_time.tv_sec = sleep_diff / 1000000000;
-                sleep_time.tv_nsec = sleep_diff % 1000000000;
+                struct timespec sleep_time {};
+                sleep_time.tv_sec = std::lround(sleep_diff / 1000000000);
+                sleep_time.tv_nsec = std::lround(sleep_diff % 1000000000);
                 mon.wasted_delay.tv_sec += sleep_time.tv_sec;
                 mon.wasted_delay.tv_nsec += sleep_time.tv_nsec;
                 LOG(DEBUG) << fmt::format("[{}:{}:{}][OFF] total: {}| wasted : {}| waittime : {}| squabble : {}\n", i,
                                           mon.tgid, mon.tid, mon.injected_delay.tv_nsec, mon.wasted_delay.tv_nsec,
                                           waittime.tv_nsec, mon.squabble_delay.tv_nsec);
                 if (monitors.check_continue(i, sleep_time)) {
-                    mon.clear_time(&mon.wasted_delay);
-                    mon.clear_time(&mon.injected_delay);
+                    Monitor::clear_time(&mon.wasted_delay);
+                    Monitor::clear_time(&mon.injected_delay);
                     mon.run();
                 }
                 clock_gettime(CLOCK_MONOTONIC, &end_ts);
@@ -412,16 +476,33 @@ int main(int argc, char *argv[]) {
                         LOG(DEBUG) << fmt::format("[SQ]total: {}| wasted : {}| waittime : {}| squabble : {}\n",
                                                   mon.injected_delay.tv_nsec, mon.wasted_delay.tv_nsec,
                                                   waittime.tv_nsec, mon.squabble_delay.tv_nsec);
-                        mon.clear_time(&mon.wasted_delay);
-                        mon.clear_time(&mon.injected_delay);
+                        Monitor::clear_time(&mon.wasted_delay);
+                        Monitor::clear_time(&mon.injected_delay);
                         mon.run();
                     } else {
                         mon.injected_delay.tv_nsec += mon.squabble_delay.tv_nsec;
-                        mon.clear_time(&mon.squabble_delay);
+                        Monitor::clear_time(&mon.squabble_delay);
                     }
                 }
             }
         } // End for-loop for all target processes
+        LOG(TRACE) << fmt::format("{}\n", monitors);
+        for (auto &mon : monitors.mon) { // by reference: a by-value loop variable is a copy, so every update was lost
+            if (mon.status == MONITOR_ON) {
+                // Exchange the before/after snapshot pointers of the monitor stored in monitors.mon.
+                auto swap = mon.before;
+                mon.before = mon.after;
+                mon.after = swap;
+                /* continue suspended processes: send SIGCONT */
+                // mon.unfreeze_counters_cha_all(fds.msr[0]);
+                // start_pmc(&fds, i);
+                if (calibrated_delay == 0) {
+                    Monitor::clear_time(&mon.wasted_delay);
+                    Monitor::clear_time(&mon.injected_delay);
+                    mon.run();
+                }
+            }
+        }
         if (monitors.check_all_terminated(tnum)) {
             break;
         }
diff --git a/src/module.cc b/src/module.cc
new file mode 100644
index 0000000..7ac58cd
--- /dev/null
+++ b/src/module.cc
@@ -0,0 +1,207 @@
+//
+// Created by victoryang00 on 11/9/23.
+//
+
+/** for thread creation and memory monitor */
+#include "sock.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <dlfcn.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#define CXLMEMSIM_EXPORT __attribute__((visibility("default")))
+#define CXLMEMSIM_CONSTRUCTOR(n) __attribute__((constructor((n))))
+#define CXLMEMSIM_CONSTRUCTOR_PRIORITY 102
+
+typedef void *(*mmap_ptr_t)(void *, size_t, int, int, int, off_t); // saved by init_mmap_ptr()
+typedef int (*munmap_ptr_t)(void *, size_t);
+typedef void *(*malloc_ptr_t)(size_t);
+typedef void *(*calloc_ptr_t)(size_t, size_t); // calloc takes (count, size) and returns the block, not an int
+typedef void *(*realloc_ptr_t)(void *, size_t);
+typedef int (*posix_memalign_ptr_t)(void **, size_t, size_t);
+typedef void *(*aligned_alloc_ptr_t)(size_t, size_t);
+typedef void (*free_ptr_t)(void *); // free returns nothing
+typedef int (*pthread_create_ptr_t)(pthread_t *, const pthread_attr_t *, void *(*)(void *), void *);
+typedef int (*pthread_join_ptr_t)(pthread_t, void **);
+typedef int (*pthread_detach_ptr_t)(pthread_t);
+typedef size_t (*malloc_usable_size_ptr_t)(void *);
+// typedef int (*mpi_send_t)(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm);
+
+typedef struct cxlmemsim_param { // state shared by the interposed functions in this file
+    int sock; // descriptor returned by socket(AF_UNIX, SOCK_DGRAM, 0) in cxlmemsim_constructor()
+    struct sockaddr_un addr; // NOTE(review): presumably the AF_UNIX address that goes with `sock` - confirm
+    mmap_ptr_t mmap; // set by init_mmap_ptr() from dlsym(RTLD_NEXT, "mmap64"); called by mmap()
+    munmap_ptr_t munmap; // resolved in cxlmemsim_constructor()
+    malloc_ptr_t malloc; // resolved in cxlmemsim_constructor(); called by malloc() and calloc()
+    calloc_ptr_t calloc; // resolved in cxlmemsim_constructor(); free() also compares pointers against its value
+    realloc_ptr_t realloc; // resolved in cxlmemsim_constructor(); called by realloc()
+    posix_memalign_ptr_t posix_memalign; // called by the posix_memalign() wrapper
+    aligned_alloc_ptr_t aligned_alloc; // called by the aligned_alloc() wrapper
+    free_ptr_t free; // resolved in cxlmemsim_constructor(); called by free()
+    pthread_create_ptr_t pthread_create; // resolved in cxlmemsim_constructor()
+    pthread_join_ptr_t pthread_join; // resolved in cxlmemsim_constructor()
+    pthread_detach_ptr_t pthread_detach; // resolved in cxlmemsim_constructor()
+    malloc_usable_size_ptr_t malloc_usable_size; // called by the malloc_usable_size() wrapper
+} cxlmemsim_param_t;
+
+// Process-wide state of the interposer.
+//
+// Empty-brace initialisation produces the same object as naming the members
+// one by one: `sock` is 0, `addr` is all zeroes and every saved function
+// pointer is null.
+//
+// The pointers receive their real values later, in init_mmap_ptr() and
+// cxlmemsim_constructor().
+cxlmemsim_param_t param = {};
+
+inline void call_socket_with_int3() {
+    // Prints a marker line on stderr, then executes the x86 `int $0x3` breakpoint instruction.
+    fprintf(stderr, "call_socket_with_int3\n");
+    // sendback tid
+
+    __asm__("int $0x3");
+}
+
+inline int init_mmap_ptr(void) { // returns 0 once param.mmap is usable, -1 when the lookup fails
+    if (param.mmap != nullptr)
+        return 0; // already resolved by an earlier call
+    param.mmap = (mmap_ptr_t)dlsym(RTLD_NEXT, "mmap64");
+    if (!param.mmap) {
+        fprintf(stderr, "Error in dlsym(RTLD_NEXT,\"mmap64\")\n"); // name the symbol that was actually looked up
+        return -1;
+    }
+    return 0;
+}
+
+CXLMEMSIM_EXPORT
+void *malloc(size_t size) { // interposed malloc: trap, trace the requested size, forward to the saved malloc
+    call_socket_with_int3();
+    fprintf(stderr, "malloc%zu\n", size); // %zu is the conversion for size_t; %ld expects a long
+    return param.malloc(size);
+}
+
+CXLMEMSIM_EXPORT void *calloc(size_t num, size_t size) { // interposed calloc: num * size zero-filled bytes
+    call_socket_with_int3();
+    if (param.mmap == nullptr || param.malloc == nullptr) // hooks not resolved yet: nothing to forward to
+        return (void *)param.calloc; // the value free() recognises and skips
+    if (size != 0 && num > (size_t)-1 / size) // num * size would wrap around
+        return nullptr;
+    void *mem = param.malloc(num * size);
+    return mem ? memset(mem, 0, num * size) : nullptr; // malloc'd memory is not zeroed, calloc's must be
+}
+
+CXLMEMSIM_EXPORT void *realloc(void *ptr, size_t size) { // interposed realloc: trap, then forward
+    call_socket_with_int3();
+    void *resized = param.realloc(ptr, size);
+    return resized;
+}
+
+CXLMEMSIM_EXPORT int posix_memalign(void **memptr, size_t alignment, size_t size) { // trap, then forward
+    call_socket_with_int3();
+    const int status = param.posix_memalign(memptr, alignment, size);
+    return status;
+}
+
+CXLMEMSIM_EXPORT void *aligned_alloc(size_t alignment, size_t size) { // trap, then forward
+    call_socket_with_int3();
+    void *block = param.aligned_alloc(alignment, size);
+    return block;
+}
+
+CXLMEMSIM_EXPORT
+void free(void *ptr) {
+    // Interposed free: trap, then release `ptr` through the saved free. A pointer equal to the value
+    // of param.calloc is skipped: that is what calloc() hands out before the hooks are resolved.
+    call_socket_with_int3();
+    const bool is_calloc_marker = (ptr == (void *)param.calloc);
+    if (!is_calloc_marker)
+        param.free(ptr);
+}
+
+CXLMEMSIM_EXPORT
+void *mmap(void *start, size_t len, int prot, int flags, int fd, off_t off) {
+    // Interposed mmap: trap, make sure the saved mapping function is resolved, then forward.
+    call_socket_with_int3();
+
+    if (init_mmap_ptr() != 0) {
+        fprintf(stderr, "init_mmap_ptr() failed\n");
+        // mmap signals failure with MAP_FAILED, i.e. (void *)-1, not with a null pointer; callers
+        // that test against MAP_FAILED would take NULL for a valid mapping. The value is written
+        // out because <sys/mman.h> is not included directly by this file.
+        return (void *)-1;
+    }
+    return param.mmap(start, len, prot, flags, fd, off);
+}
+
+CXLMEMSIM_EXPORT void *mmap64(void *start, size_t len, int prot, int flags, int fd, off_t off) {
+    // Traps, then defers to this file's mmap() wrapper, which traps once more before forwarding.
+    call_socket_with_int3();
+    return mmap(start, len, prot, flags, fd, off);
+}
+
+CXLMEMSIM_EXPORT size_t malloc_usable_size(void *ptr) { /* added for redis */
+    call_socket_with_int3();
+    const size_t usable = param.malloc_usable_size(ptr);
+    return usable;
+}
+
+CXLMEMSIM_CONSTRUCTOR(CXLMEMSIM_CONSTRUCTOR_PRIORITY) static void cxlmemsim_constructor() {
+    // Library constructor: saves the next definition of every interposed symbol into `param`, then
+    // creates the AF_UNIX datagram socket and records the SOCKET_PATH address next to it.
+
+    // Resolve `name` with dlsym(RTLD_NEXT, ...). A missing symbol prints
+    // `Error in dlsym(RTLD_NEXT,"<name>")` on stderr and terminates the process with exit(-1).
+    auto resolve = [](const char *name) -> void * {
+        void *sym = dlsym(RTLD_NEXT, name);
+        if (!sym) {
+            fprintf(stderr, "Error in dlsym(RTLD_NEXT,\"%s\")\n", name);
+            exit(-1);
+        }
+        return sym;
+    };
+
+    // save the original impl of mmap
+    // The result is deliberately not checked here: init_mmap_ptr() reports a failed lookup itself
+    // and the mmap() wrapper calls it again before every forwarded call.
+    init_mmap_ptr();
+
+    // Memory management entry points.
+    param.munmap = (munmap_ptr_t)resolve("munmap");
+    param.malloc = (malloc_ptr_t)resolve("malloc");
+    param.free = (free_ptr_t)resolve("free");
+    param.calloc = (calloc_ptr_t)resolve("calloc");
+    param.realloc = (realloc_ptr_t)resolve("realloc");
+
+    // The posix_memalign(), aligned_alloc() and malloc_usable_size() wrappers in this file call
+    // through these three pointers. Without these lookups the pointers stay null and the first
+    // call through any of those wrappers jumps to address 0.
+    param.posix_memalign = (posix_memalign_ptr_t)resolve("posix_memalign");
+    param.aligned_alloc = (aligned_alloc_ptr_t)resolve("aligned_alloc");
+    param.malloc_usable_size = (malloc_usable_size_ptr_t)resolve("malloc_usable_size");
+
+    // Thread entry points.
+    param.pthread_create = (pthread_create_ptr_t)resolve("pthread_create");
+    param.pthread_detach = (pthread_detach_ptr_t)resolve("pthread_detach");
+    param.pthread_join = (pthread_join_ptr_t)resolve("pthread_join");
+
+    param.sock = socket(AF_UNIX, SOCK_DGRAM, 0);
+    if (param.sock < 0) {
+        // Nothing in this file uses the socket yet, so report the failure and carry on rather
+        // than taking the host process down.
+        perror("socket");
+    }
+
+    /** register the original impl */
+    // Build the address directly in param.addr; filling a local sockaddr_un would discard it on
+    // return and leave param.addr zeroed.
+    memset(&param.addr, 0, sizeof(struct sockaddr_un));
+    param.addr.sun_family = AF_UNIX;
+    strncpy(param.addr.sun_path, SOCKET_PATH, sizeof(param.addr.sun_path) - 1);
+    fprintf(stderr, "start\n");
+}
+
+__attribute__((destructor)) static void cxlmemsim_destructor() { fputs("fini", stderr); /* unload marker */ }
diff --git a/src/monitor.cpp b/src/monitor.cpp
index 11b4f7c..20d3992 100644
--- a/src/monitor.cpp
+++ b/src/monitor.cpp
@@ -3,15 +3,14 @@
 //
 
 #include "monitor.h"
-Monitors::Monitors(int tnum, cpu_set_t *use_cpuset, int nmem, Helper h) {
-    mon = std::vector<Monitor>(tnum, Monitor(nmem, h));
-    /* init mon */
+Monitors::Monitors(int tnum, cpu_set_t *use_cpuset) : print_flag(true) {
+    mon = std::vector<Monitor>(tnum, Monitor());
+    /** Init mon */
     for (int i = 0; i < tnum; i++) {
         disable(i);
-        int cpucnt = 0;
-        int cpuid = 0;
-        for (cpuid = 0; cpuid < h.cpu; cpuid++) {
-            if (CPU_ISSET(cpuid, use_cpuset)) {
+        int cpucnt = 0, cpuid;
+        for (cpuid = 0; cpuid < helper.num_of_cpu(); cpuid++) {
+            if (!CPU_ISSET(cpuid, use_cpuset)) {
                 if (i == cpucnt) {
                     mon[i].cpu_core = cpuid;
                     break;
@@ -35,8 +34,15 @@ void Monitors::run_all(const int processes) {
         }
     }
 }
+Monitor Monitors::get_mon(int tgid, int tid) { // copy of the monitor matching (tgid, tid), or a blank one
+    for (auto &i : mon)
+        if (i.tgid == tgid && i.tid == tid)
+            return i;
+    LOG(ERROR) << fmt::format("Monitor [tgid={}, tid={}] not found\n", tgid, tid);
+    return Monitor(); // flowing off the end of a non-void function is undefined behaviour
+}
 int Monitors::enable(const uint32_t tgid, const uint32_t tid, bool is_process, uint64_t pebs_sample_period,
-                     const int32_t tnum, bool is_page) {
+                     const int32_t tnum) {
     int target = -1;
 
     for (int i = 0; i < tnum; i++) {
@@ -77,22 +83,22 @@ int Monitors::enable(const uint32_t tgid, const uint32_t tid, bool is_process, u
     disable(target);
     mon[target].status = MONITOR_ON;
     mon[target].tgid = tgid;
-    mon[target].tid = tid;
+    mon[target].tid = tid; // We can setup the process here
     mon[target].is_process = is_process;
 
     if (pebs_sample_period) {
         /* pebs start */
-        mon[target].pebs_ctx = new PEBS(tgid, pebs_sample_period, is_page);
+        mon[target].pebs_ctx = new PEBS(tgid, pebs_sample_period);
         LOG(DEBUG) << fmt::format("{}Process [tgid={}, tid={}]: enable to pebs.\n", target, mon[target].tgid,
-                                  mon[target].tid);
+                                  mon[target].tid); // multiple tid multiple pid
     }
 
-    LOG(INFO) << fmt::format("========== Process {}[tgid={}, tid={}] monitoring start ==========\n", target,
-                             mon[target].tgid, mon[target].tid);
-    return 0;
+    LOG(INFO) << fmt::format("pid {}[tgid={}, tid={}] monitoring start\n", target, mon[target].tgid, mon[target].tid);
+
+    return target;
 }
 void Monitors::disable(const uint32_t target) {
-    mon[target].is_process = false;
+    mon[target].is_process = false; // Here to add the multi process.
     mon[target].status = MONITOR_DISABLE;
     mon[target].tgid = 0;
     mon[target].tid = 0;
@@ -114,9 +120,9 @@ void Monitors::disable(const uint32_t target) {
         mon[target].pebs_ctx->mp = nullptr;
         mon[target].pebs_ctx->sample_period = 0;
     }
-    for (int j = 0; j < 2; j++) {
-        mon[target].elem[j].pebs.total = 0;
-        mon[target].elem[j].pebs.llcmiss = 0;
+    for (auto &j : mon[target].elem) {
+        j.pebs.total = 0;
+        j.pebs.llcmiss = 0;
     }
 }
 bool Monitors::check_all_terminated(const uint32_t processes) {
@@ -181,8 +187,8 @@ bool Monitors::check_continue(const uint32_t target, const struct timespec w) {
     return false;
 }
 
-void Monitor::stop() {
-    int ret = -1;
+void Monitor::stop() { // thread create and proecess create get the pmu
+    int ret;
 
     if (this->is_process) {
         // In case of process, use SIGSTOP.
@@ -211,6 +217,7 @@ void Monitor::stop() {
         LOG(DEBUG) << fmt::format("Process [{}:{}] is stopped.\n", this->tgid, this->tid);
     }
 }
+
 void Monitor::run() {
     LOG(DEBUG) << fmt::format("Send SIGCONT to tid={}(tgid={})\n", this->tid, this->tgid);
     if (syscall(SYS_tgkill, this->tgid, this->tid, SIGCONT) == -1) {
@@ -229,24 +236,19 @@ void Monitor::run() {
         LOG(DEBUG) << fmt::format("Process [{}:{}] is running.\n", this->tgid, this->tid);
     }
 }
+
 void Monitor::clear_time(struct timespec *time) {
     time->tv_sec = 0;
     time->tv_nsec = 0;
 }
-Monitor::Monitor(const int nmem, Helper h)
+
+Monitor::Monitor() // which one to hook
     : tgid(0), tid(0), cpu_core(0), status(0), injected_delay({0}), wasted_delay({0}), squabble_delay({0}),
       before(nullptr), after(nullptr), total_delay(0), start_exec_ts({0}), end_exec_ts({0}), is_process(false),
       pebs_ctx(nullptr) {
+
     for (auto &j : this->elem) {
-        j.cpus = (struct CPUElem *)calloc(sizeof(struct CPUElem), h.cpu);
-        if (j.cpus == nullptr) {
-            LOG(ERROR) << "calloc";
-            throw;
-        }
-        j.cbos = (struct CBOElem *)calloc(sizeof(struct CBOElem), h.cbo);
-        if (j.cbos == nullptr) {
-            LOG(ERROR) << "calloc";
-            throw;
-        }
+        j.cpus = std::vector<CPUElem>(helper.used_cpu.size());
+        j.chas = std::vector<CHAElem>(helper.used_cha.size());
     }
-}
+}
\ No newline at end of file
diff --git a/src/pebs.cpp b/src/pebs.cpp
index 20f5f6c..ffe699e 100644
--- a/src/pebs.cpp
+++ b/src/pebs.cpp
@@ -24,17 +24,17 @@ struct perf_sample {
 long perf_event_open(struct perf_event_attr *event_attr, pid_t pid, int cpu, int group_fd, unsigned long flags) {
     return syscall(__NR_perf_event_open, event_attr, pid, cpu, group_fd, flags);
 }
-PEBS::PEBS(pid_t pid, uint64_t sample_period, bool is_page) : pid(pid), sample_period(sample_period), is_page(is_page) {
+PEBS::PEBS(pid_t pid, uint64_t sample_period) : pid(pid), sample_period(sample_period) {
     // Configure perf_event_attr struct
     struct perf_event_attr pe = {
         .type = PERF_TYPE_RAW,
         .size = sizeof(struct perf_event_attr),
         .config = 0x20d1, // mem_load_retired.l3_miss
-        .sample_period = 1,
+        .sample_period = sample_period,
         .sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_ADDR | PERF_SAMPLE_READ | PERF_SAMPLE_PHYS_ADDR,
         .read_format = PERF_FORMAT_TOTAL_TIME_ENABLED,
         .disabled = 1, // Event is initially disabled
-        .exclude_kernel = 0,
+        .exclude_kernel = 1,
         .precise_ip = 1,
         .config1 = 3,
     }; // excluding events that happen in the kernel-space
@@ -50,7 +50,7 @@ PEBS::PEBS(pid_t pid, uint64_t sample_period, bool is_page) : pid(pid), sample_p
     }
 
     this->mplen = MMAP_SIZE;
-    this->mp = (perf_event_mmap_page *)mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, this->fd, 0);
+    this->mp = (perf_event_mmap_page *)mmap(nullptr, MMAP_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, this->fd, 0);
 
     if (this->mp == MAP_FAILED) {
         perror("mmap");
@@ -68,14 +68,13 @@ int PEBS::read(CXLController *controller, struct PEBSElem *elem) {
         return -1;
 
     int r = 0;
-    int i;
     struct perf_event_header *header;
     struct perf_sample *data;
     uint64_t last_head;
     char *dp = ((char *)mp) + PAGE_SIZE;
 
     do {
-        this->seq = mp->lock;
+        this->seq = mp->lock; // explicit copy
         barrier();
         last_head = mp->data_head;
         while ((uint64_t)this->rdlen < last_head) {
@@ -99,7 +98,7 @@ int PEBS::read(CXLController *controller, struct PEBSElem *elem) {
                                               data->value, data->timestamp);
                     controller->insert(data->timestamp, data->phys_addr, data->addr, 0);
                     elem->total++;
-                    elem->llcmiss = data->value;
+                    elem->llcmiss = data->value; // this is the number of llc miss
                 }
                 break;
             case PERF_RECORD_THROTTLE:
@@ -122,7 +121,7 @@ int PEBS::read(CXLController *controller, struct PEBSElem *elem) {
         mp->data_tail = last_head;
         barrier();
     } while (mp->lock != this->seq);
-
+    
     return r;
 }
 int PEBS::start() {
diff --git a/src/perf.cpp b/src/perf.cpp
index c70c8ce..b8c81c5 100644
--- a/src/perf.cpp
+++ b/src/perf.cpp
@@ -4,359 +4,7 @@
 
 #include "perf.h"
 #include "pebs.h"
-#include <gelf.h>
 
-#define MAX_MAPS 32
-#define DEBUGFS "/sys/kernel/debug/tracing/"
-struct bpf_map_def {
-    unsigned int type;
-    unsigned int key_size;
-    unsigned int value_size;
-    unsigned int max_entries;
-    unsigned int map_flags;
-    unsigned int inner_map_idx;
-};
-struct bpf_map_data {
-    int fd;
-    char *name;
-    size_t elf_offset;
-    struct bpf_map_def def;
-};
-static char license[128];
-static int kern_version;
-static bool processed_sec[128];
-char bpf_log_buf[BPF_LOG_BUF_SIZE];
-int map_fd[MAX_MAPS];
-int prog_fd;
-int event_fd;
-int prog_array_fd = -1;
-struct bpf_map_data map_data[MAX_MAPS];
-int map_data_count = 0;
-static int cmp_symbols(const void *l, const void *r) {
-    const GElf_Sym *lsym = (const GElf_Sym *)l;
-    const GElf_Sym *rsym = (const GElf_Sym *)r;
-
-    if (lsym->st_value < rsym->st_value)
-        return -1;
-    else if (lsym->st_value > rsym->st_value)
-        return 1;
-    else
-        return 0;
-}
-static int load_maps(struct bpf_map_data *maps, int nr_maps) {
-    int i;
-
-    for (i = 0; i < nr_maps; i++) {
-        if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS || maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) {
-            int inner_map_fd = map_fd[maps[i].def.inner_map_idx];
-            struct bpf_map_create_opts opt = {
-                .inner_map_fd = static_cast<__u32>(inner_map_fd),
-                .map_flags = maps[i].def.map_flags,
-                .numa_node = 0,
-            };
-
-            map_fd[i] = bpf_map_create(((enum bpf_map_type)maps[i].def.type), "my_map", maps[i].def.key_size,
-                                       maps[i].def.value_size, maps[i].def.max_entries, &opt);
-        } else {
-            struct bpf_map_create_opts opt = {
-                .map_flags = maps[i].def.map_flags,
-                .numa_node = 0,
-            };
-            map_fd[i] = bpf_map_create(((enum bpf_map_type)maps[i].def.type), "my_map", maps[i].def.key_size,
-                                       maps[i].def.value_size, maps[i].def.max_entries, &opt);
-        }
-        if (map_fd[i] < 0) {
-            LOG(ERROR) << fmt::format("failed to create a map: {} {}\n", errno, strerror(errno));
-            return 1;
-        }
-        maps[i].fd = map_fd[i];
-
-        if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY)
-            prog_array_fd = map_fd[i];
-    }
-    return 0;
-}
-
-static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, GElf_Shdr *shdr, struct bpf_insn *insn,
-                                struct bpf_map_data *maps, int nr_maps) {
-    int i, nrels;
-
-    nrels = shdr->sh_size / shdr->sh_entsize;
-
-    for (i = 0; i < nrels; i++) {
-        GElf_Sym sym;
-        GElf_Rel rel;
-        unsigned int insn_idx;
-        bool match = false;
-        int map_idx;
-
-        gelf_getrel(data, i, &rel);
-
-        insn_idx = rel.r_offset / sizeof(struct bpf_insn);
-
-        gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym);
-
-        if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
-            LOG(ERROR) << fmt::format("invalid relo for insn[{}].code 0x%x\n", insn_idx, insn[insn_idx].code);
-            return 1;
-        }
-        insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
-
-        /* Match FD relocation against recorded map_data[] offset */
-        for (map_idx = 0; map_idx < nr_maps; map_idx++) {
-            if (maps[map_idx].elf_offset == sym.st_value) {
-                match = true;
-                break;
-            }
-        }
-        if (match) {
-            insn[insn_idx].imm = maps[map_idx].fd;
-        } else {
-            LOG(ERROR) << fmt::format("invalid relo for insn[{}] no map_data match\n", insn_idx);
-            return 1;
-        }
-    }
-
-    return 0;
-}
-
-static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname, GElf_Shdr *shdr, Elf_Data **data) {
-    Elf_Scn *scn;
-
-    scn = elf_getscn(elf, i);
-    if (!scn)
-        return 1;
-
-    if (gelf_getshdr(scn, shdr) != shdr)
-        return 2;
-
-    *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
-    if (!*shname || !shdr->sh_size)
-        return 3;
-
-    *data = elf_getdata(scn, 0);
-    if (!*data || elf_getdata(scn, *data) != nullptr)
-        return 4;
-
-    return 0;
-}
-
-static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx, Elf *elf, Elf_Data *symbols,
-                                 int strtabidx) {
-    int map_sz_elf, map_sz_copy;
-    bool validate_zero = false;
-    Elf_Data *data_maps;
-    int i, nr_maps;
-    GElf_Sym *sym;
-    Elf_Scn *scn;
-    int copy_sz;
-
-    if (maps_shndx < 0)
-        return -EINVAL;
-    if (!symbols)
-        return -EINVAL;
-
-    /* Get data for maps section via elf index */
-    scn = elf_getscn(elf, maps_shndx);
-    if (scn)
-        data_maps = elf_getdata(scn, NULL);
-    if (!scn || !data_maps) {
-        printf("Failed to get Elf_Data from maps section {}\n", maps_shndx);
-        return -EINVAL;
-    }
-
-    /* For each map get corrosponding symbol table entry */
-    sym = static_cast<GElf_Sym *>(calloc(MAX_MAPS + 1, sizeof(GElf_Sym)));
-    for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
-        if (!gelf_getsym(symbols, i, &sym[nr_maps]))
-            continue;
-        if (sym[nr_maps].st_shndx != maps_shndx)
-            continue;
-        /* Only increment iif maps section */
-        nr_maps++;
-    }
-
-    /* Align to map_fd[] order, via sort on offset in sym.st_value */
-    qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols);
-
-    map_sz_elf = data_maps->d_size / nr_maps;
-    map_sz_copy = sizeof(struct bpf_map_def);
-    if (map_sz_elf < map_sz_copy) {
-        /*
-         * Backward compat, loading older ELF file with
-         * smaller struct, keeping remaining bytes zero.
-         */
-        map_sz_copy = map_sz_elf;
-    } else if (map_sz_elf > map_sz_copy) {
-        /*
-         * Forward compat, loading newer ELF file with larger
-         * struct with unknown features. Assume zero means
-         * feature not used.  Thus, validate rest of struct
-         * data is zero.
-         */
-        validate_zero = true;
-    }
-
-    /* Memcpy relevant part of ELF maps data to loader maps */
-    for (i = 0; i < nr_maps; i++) {
-        unsigned char *addr, *end;
-        struct bpf_map_def *def;
-        const char *map_name;
-        size_t offset;
-
-        map_name = elf_strptr(elf, strtabidx, sym[i].st_name);
-        maps[i].name = strdup(map_name);
-        if (!maps[i].name) {
-            printf("strdup({}): {}({})\n", map_name, strerror(errno), errno);
-            free(sym);
-            return -errno;
-        }
-
-        /* Symbol value is offset into ELF maps section data area */
-        offset = sym[i].st_value;
-        def = (struct bpf_map_def *)(((long)data_maps->d_buf) + offset);
-        maps[i].elf_offset = offset;
-        memset(&maps[i].def, 0, sizeof(struct bpf_map_def));
-        memcpy(&maps[i].def, def, map_sz_copy);
-
-        /* Verify no newer features were requested */
-        if (validate_zero) {
-            addr = (unsigned char *)def + map_sz_copy;
-            end = (unsigned char *)def + map_sz_elf;
-            for (; addr < end; addr++) {
-                if (*addr != 0) {
-                    free(sym);
-                    return -EFBIG;
-                }
-            }
-        }
-    }
-
-    free(sym);
-    return nr_maps;
-}
-
-static perf_event_attr load_and_attach(const char *event, struct bpf_insn *prog, int size, int pid, int cpu) {
-    bool is_socket = strncmp(event, "socket", 6) == 0;
-    bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
-    bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
-    bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
-    bool is_xdp = strncmp(event, "xdp", 3) == 0;
-    bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
-    bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0;
-    bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
-    size_t insns_cnt = size / sizeof(struct bpf_insn);
-    enum bpf_prog_type prog_type;
-    char buf[256];
-    int fd, efd, err, id;
-    struct perf_event_attr attr = {};
-
-    attr.type = PERF_TYPE_TRACEPOINT;
-    attr.sample_type = PERF_SAMPLE_RAW;
-    attr.sample_period = 1;
-    attr.wakeup_events = 1;
-
-    if (is_socket) {
-        prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
-    } else if (is_kprobe || is_kretprobe) {
-        prog_type = BPF_PROG_TYPE_KPROBE;
-    } else if (is_tracepoint) {
-        prog_type = BPF_PROG_TYPE_TRACEPOINT;
-    } else if (is_xdp) {
-        prog_type = BPF_PROG_TYPE_XDP;
-    } else if (is_perf_event) {
-        prog_type = BPF_PROG_TYPE_PERF_EVENT;
-    } else if (is_cgroup_skb) {
-        prog_type = BPF_PROG_TYPE_CGROUP_SKB;
-    } else if (is_cgroup_sk) {
-        prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
-    } else {
-        LOG(ERROR) << fmt::format("Unknown event '{}'\n", event);
-        throw;
-    }
-
-    fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version, bpf_log_buf, BPF_LOG_BUF_SIZE);
-    if (fd < 0) {
-        LOG(ERROR) << fmt::format("bpf_load_program() err={}\n{}", errno, bpf_log_buf);
-        throw;
-    }
-
-    prog_fd = fd;
-
-    if (is_kprobe || is_kretprobe) {
-        if (is_kprobe)
-            event += 7;
-        else
-            event += 10;
-
-        if (*event == 0) {
-            LOG(ERROR) << fmt::format("event name cannot be empty\n");
-            throw;
-        }
-
-        snprintf(buf, sizeof(buf), "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events", is_kprobe ? 'p' : 'r',
-                 event, event);
-        err = system(buf);
-        LOG(INFO) << fmt::format("echo '{}:{} {}' >> /sys/kernel/debug/tracing/kprobe_events", is_kprobe ? 'p' : 'r',
-                                 event, event);
-        if (err < 0) {
-            LOG(ERROR) << fmt::format("failed to create kprobe '{}' error '{}'\n", event, strerror(errno));
-        }
-
-        strcpy(buf, DEBUGFS);
-        strcat(buf, "events/kprobes/");
-        strcat(buf, event);
-        strcat(buf, "/id");
-    } else if (is_tracepoint) {
-        event += 11;
-
-        if (*event == 0) {
-            LOG(ERROR) << fmt::format("event name cannot be empty\n");
-            throw;
-        }
-        strcpy(buf, DEBUGFS);
-        strcat(buf, "events/");
-        strcat(buf, event);
-        strcat(buf, "/id");
-    }
-
-    efd = open(buf, O_RDONLY, 0);
-    if (efd < 0) {
-        LOG(ERROR) << fmt::format("failed to open event {}\n", event);
-        throw;
-    }
-
-    err = read(efd, buf, sizeof(buf));
-    if (err < 0 || err >= sizeof(buf)) {
-        LOG(ERROR) << fmt::format("read from '{}' failed '{}'\n", event, strerror(errno));
-        throw;
-    }
-
-    close(efd);
-
-    buf[err] = 0;
-    id = atoi(buf);
-    attr.config = id;
-
-    efd = perf_event_open(&attr, pid, cpu, -1, 0);
-    if (efd < 0) {
-        LOG(ERROR) << fmt::format("event {} fd {} err {}\n", id, efd, strerror(errno));
-        throw;
-    }
-    event_fd = efd;
-    ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
-    ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
-    return attr;
-}
-
-// PerfInfo::PerfInfo() {
-//     this->fd = perf_event_open(&this->attr, this->pid, this->cpu, this->group_fd, this->flags);
-//     if (this->fd == -1) {
-//         LOG(ERROR) << "perf_event_open";
-//         throw;
-//     }
-//     ioctl(this->fd, PERF_EVENT_IOC_RESET, 0);
-// }
 PerfInfo::PerfInfo(int group_fd, int cpu, pid_t pid, unsigned long flags, struct perf_event_attr attr)
     : group_fd(group_fd), cpu(cpu), pid(pid), flags(flags), attr(attr) {
     this->fd = perf_event_open(&this->attr, this->pid, this->cpu, this->group_fd, this->flags);
@@ -366,13 +14,7 @@ PerfInfo::PerfInfo(int group_fd, int cpu, pid_t pid, unsigned long flags, struct
     }
     ioctl(this->fd, PERF_EVENT_IOC_RESET, 0);
 }
-PerfInfo::PerfInfo(int fd, int group_fd, int cpu, pid_t pid, unsigned long flags, struct perf_event_attr attr)
-    : fd(fd), group_fd(group_fd), cpu(cpu), pid(pid), flags(flags), attr(attr) {
-    this->map = new ThreadSafeMap();
-    this->j = std::jthread{[&] { write_trace_to_map(map); }};
-}
 PerfInfo::~PerfInfo() {
-    this->j.join();
     if (this->fd != -1) {
         close(this->fd);
         this->fd = -1;
@@ -384,212 +26,53 @@ PerfInfo::~PerfInfo() {
  *   This can be avoided by executing nanosleep with 0.
  */
 ssize_t PerfInfo::read_pmu(uint64_t *value) {
-    struct timespec zero = {0};
-    nanosleep(&zero, nullptr);
     ssize_t r = read(this->fd, value, sizeof(*value));
     if (r < 0) {
-        LOG(ERROR) << "read";
+        LOG(ERROR) << "read\n";
     }
     return r;
 }
 int PerfInfo::start() {
     if (ioctl(this->fd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
-        LOG(ERROR) << "ioctl";
+        LOG(ERROR) << "ioctl\n";
         return -1;
     }
     return 0;
 }
 int PerfInfo::stop() {
     if (ioctl(this->fd, PERF_EVENT_IOC_DISABLE, 0) < 0) {
-        LOG(ERROR) << "ioctl";
+        LOG(ERROR) << "ioctl\n";
         return -1;
     }
     return 0;
 }
-std::map<uint64_t, uint64_t> PerfInfo::read_trace_pipe() {
-    auto traces = map->get();
-    std::map<uint64_t, uint64_t> addr_map;
-    for (auto r : traces) {
-        std::cout << r.first << " " << std::get<0>(r.second) << " " << std::get<1>(r.second) << std::endl;
-        // address, length, time -> address, length no lazyaccess
-        addr_map[r.first] = std::get<0>(r.second);
-    }
-    map->reset();
-    return addr_map;
-}
 
 PerfInfo *init_incore_perf(const pid_t pid, const int cpu, uint64_t conf, uint64_t conf1) {
-    int r, n_pid, n_cpu, group_fd, flags;
+    int n_pid, n_cpu, group_fd, flags;
     struct perf_event_attr attr {
         .type = PERF_TYPE_RAW, .size = sizeof(attr), .config = conf, .disabled = 1, .inherit = 1, .config1 = conf1,
         .clockid = 0
     };
-    if ((0 <= cpu) && (cpu < Helper::num_of_cpu())) {
-        n_pid = -1;
-        n_cpu = cpu;
-    } else {
-        n_pid = pid;
-        n_cpu = -1;
-    }
+    n_pid = -1;
+    n_cpu = cpu;
 
     group_fd = -1;
     flags = 0x08;
 
-    return new PerfInfo(group_fd, n_cpu, n_pid, static_cast<unsigned long>(flags), attr);
+    return new PerfInfo{group_fd, n_cpu, n_pid, static_cast<unsigned long>(flags), attr};
 }
 
-PerfInfo *init_incore_bpf_perf(const pid_t pid, const int cpu) {
-    int fd, i, ret, maps_shndx = -1, strtabidx = -1;
-    struct perf_event_attr attr {};
-    Elf *elf;
-    GElf_Ehdr ehdr;
-    GElf_Shdr shdr, shdr_prog;
-    Elf_Data *data, *data_prog, *data_maps = nullptr, *symbols = nullptr;
-    char *shname, *shname_prog;
-    int nr_maps = 0;
-
-    /* reset global variables */
-    kern_version = 0;
-    memset(license, 0, sizeof(license));
-    memset(processed_sec, 0, sizeof(processed_sec));
-
-    if (elf_version(EV_CURRENT) == EV_NONE)
-        throw;
-
-    fd = open("./collectmmap.o", O_RDONLY, 0);
-    if (fd < 0)
-        throw;
-
-    elf = elf_begin(fd, ELF_C_READ, nullptr);
-
-    if (!elf)
-        throw;
-
-    if (gelf_getehdr(elf, &ehdr) != &ehdr)
-        throw;
-
-    /* clear all kprobes */
-    i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events");
-
-    /* scan over all elf sections to get license and map info */
-    for (i = 1; i < ehdr.e_shnum; i++) {
-
-        if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
-            continue;
-
-        if (strcmp(shname, "license") == 0) {
-            processed_sec[i] = true;
-            memcpy(license, data->d_buf, data->d_size);
-        } else if (strcmp(shname, "version") == 0) {
-            processed_sec[i] = true;
-            if (data->d_size != sizeof(int)) {
-                LOG(ERROR) << fmt::format("invalid size of version section %zd\n", data->d_size);
-                throw;
-            }
-            memcpy(&kern_version, data->d_buf, sizeof(int));
-        } else if (strcmp(shname, "maps") == 0) {
-            int j;
-
-            maps_shndx = i;
-            data_maps = data;
-            for (j = 0; j < MAX_MAPS; j++)
-                map_data[j].fd = -1;
-        } else if (shdr.sh_type == SHT_SYMTAB) {
-            strtabidx = shdr.sh_link;
-            symbols = data;
-        }
-    }
-
-    ret = 1;
-
-    if (!symbols) {
-        LOG(ERROR) << fmt::format("missing SHT_SYMTAB section\n");
-        throw;
-    }
-
-    if (data_maps) {
-        nr_maps = load_elf_maps_section(map_data, maps_shndx, elf, symbols, strtabidx);
-        if (nr_maps < 0) {
-            LOG(ERROR) << fmt::format("Error: Failed loading ELF maps (errno:{}):{}\n", nr_maps, strerror(-nr_maps));
-            ret = 1;
-            throw;
-        }
-        if (load_maps(map_data, nr_maps))
-            throw;
-        map_data_count = nr_maps;
-
-        processed_sec[maps_shndx] = true;
-    }
-
-    /* load programs that need map fixup (relocations) */
-    for (i = 1; i < ehdr.e_shnum; i++) {
-        if (processed_sec[i])
-            continue;
-
-        if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
-            continue;
-        if (shdr.sh_type == SHT_REL) {
-            struct bpf_insn *insns;
-
-            if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog, &shdr_prog, &data_prog))
-                continue;
-
-            if (shdr_prog.sh_type != SHT_PROGBITS || !(shdr_prog.sh_flags & SHF_EXECINSTR))
-                continue;
-
-            insns = (struct bpf_insn *)data_prog->d_buf;
-
-            processed_sec[shdr.sh_info] = true;
-            processed_sec[i] = true;
-
-            if (parse_relo_and_apply(data, symbols, &shdr, insns, map_data, nr_maps))
-                continue;
-
-            if (memcmp(shname_prog, "kprobe/", 7) == 0 || memcmp(shname_prog, "kretprobe/", 10) == 0 ||
-                memcmp(shname_prog, "tracepoint/", 11) == 0 || memcmp(shname_prog, "xdp", 3) == 0 ||
-                memcmp(shname_prog, "perf_event", 10) == 0 || memcmp(shname_prog, "socket", 6) == 0 ||
-                memcmp(shname_prog, "cgroup/", 7) == 0)
-                attr = load_and_attach(shname_prog, insns, data_prog->d_size, pid, cpu);
-            return new PerfInfo(event_fd, -1, cpu, pid, 0, attr);
-        }
-    }
-
-    /* load programs that don't use maps */
-    for (i = 1; i < ehdr.e_shnum; i++) {
-
-        if (processed_sec[i])
-            continue;
-
-        if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
-            continue;
-
-        if (memcmp(shname, "kprobe/", 7) == 0 || memcmp(shname, "kretprobe/", 10) == 0 ||
-            memcmp(shname, "tracepoint/", 11) == 0 || memcmp(shname, "xdp", 3) == 0 ||
-            memcmp(shname, "perf_event", 10) == 0 || memcmp(shname, "socket", 6) == 0 ||
-            memcmp(shname, "cgroup/", 7) == 0)
-            attr = load_and_attach(shname, (struct bpf_insn *)data->d_buf, data->d_size, pid, cpu);
-        return new PerfInfo(event_fd, -1, cpu, pid, 0, attr);
-    }
-
-    return nullptr;
-}
+PerfInfo *init_uncore_perf(const pid_t pid, const int cpu, uint64_t conf, uint64_t conf1, int value) {
+    int group_fd = -1;
+    auto attr = perf_event_attr{
+        .type = (uint32_t)value,
+        .size = sizeof(struct perf_event_attr),
+        .config = conf,
+        .disabled = 1,
+        .inherit = 1,
+        .enable_on_exec = 1,
+        .config1 = conf1,
+    };
 
-void write_trace_to_map(ThreadSafeMap *map) {
-    std::ifstream fp(DEBUGFS "trace_pipe");
-    int i;
-    unsigned long size;
-    unsigned long address;
-    unsigned long long time;
-    std::string line;
-    while (std::getline(fp, line)) {
-        if (line.size() > 50 && line.contains("ls")) {
-            i = std::sscanf(line.substr(51, 57).c_str(), "bpf_trace_printk: munmap %lu %lu %llu", &size, &address,
-                            &time);
-            std::cout << line.substr(51, 57).c_str() << " " << i << std::endl;
-            if (i > 1) {
-                map->insert(address, size, time);
-                std::cout << address << " " << size << " " << time << std::endl;
-            }
-        }
-    }
+    return new PerfInfo{group_fd, cpu, pid, 0, attr};
 }
diff --git a/src/policy.cpp b/src/policy.cpp
index 4139be5..b1ca4b1 100644
--- a/src/policy.cpp
+++ b/src/policy.cpp
@@ -4,11 +4,26 @@
 
 #include "policy.h"
 #include <numeric>
-Policy::Policy() {}
-InterleavePolicy::InterleavePolicy() {}
+// TODO:
+AllocationPolicy::AllocationPolicy() = default;
+InterleavePolicy::InterleavePolicy() = default;
 // If the number is -1 for local, else it is the index of the remote server
 int InterleavePolicy::compute_once(CXLController *controller) {
-    auto per_size = controller->is_page ? 4096 : 64;
+    int per_size;
+    switch (controller->page_type_) {
+    case CACHELINE:
+        per_size = 64;
+        break;
+    case PAGE:
+        per_size = 4096;
+        break;
+    case HUGEPAGE_2M:
+        per_size = 2 * 1024 * 1024;
+        break;
+    case HUGEPAGE_1G:
+        per_size = 1024 * 1024 * 1024;
+        break;
+    };
     if (controller->occupation.size() * per_size / 1024 / 1024 < controller->capacity * 0.9) {
         return -1;
     } else {
diff --git a/src/sock.cc b/src/sock.cc
new file mode 100644
index 0000000..54fc44b
--- /dev/null
+++ b/src/sock.cc
@@ -0,0 +1,121 @@
+#include "sock.h"
+#include "cxlendpoint.h"
+#include "helper.h"
+#include "monitor.h"
+#include "policy.h"
+#include <cerrno>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <ctime>
+#include <cxxopts.hpp>
+#include <sys/poll.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+Helper helper{};
+int main() { // receives op_data datagrams on SOCKET_PATH and dispatches on their opcode
+    // auto tnum = 1;
+    // auto pebsperiod = 1000000;
+    // std::vector<int> cpuset = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+    // std::vector<std::string> pmu_name = {"1","2","3","4","5","6","7","8"};
+    // std::vector<uint64_t> pmu_config1 = {0, 1, 2, 3, 4, 5, 6, 7};
+    // std::vector<uint64_t> pmu_config2 = {0, 1, 2, 3, 4, 5, 6, 7};
+    // uint64_t use_cpus = 0;
+    // cpu_set_t use_cpuset;
+    // CPU_ZERO(&use_cpuset);
+    // for (auto i : cpuset) {
+    //     if (!use_cpus || use_cpus & 1UL << i) {
+    //         CPU_SET(i, &use_cpuset);
+    //         LOG(DEBUG) << fmt::format("use cpuid: {}{}\n", i, use_cpus);
+    //     }
+    // }
+    auto sock = socket(AF_UNIX, SOCK_DGRAM, 0);
+    struct sockaddr_un addr {};
+    // Remove any socket file left at SOCKET_PATH before binding to it.
+    addr.sun_family = AF_UNIX;
+    strcpy(addr.sun_path, SOCKET_PATH);
+    remove(addr.sun_path);
+    if (bind(sock, (struct sockaddr *)&addr, sizeof(addr)) == -1) { // can be blocked for multi thread
+        LOG(ERROR) << "Failed to execute. Can't bind to a socket.\n";
+        exit(1);
+    }
+    // One spare byte: a datagram longer than op_data fills the buffer and fails the size check below.
+    size_t sock_buf_size = sizeof(op_data) + 1;
+    char *sock_buf = (char *)malloc(sock_buf_size);
+
+    // Monitors monitors{tnum, &use_cpuset};
+    // auto perf_config =
+    //     helper.detect_model(monitors.mon[0].before->cpuinfo.cpu_model, pmu_name, pmu_config1, pmu_config2);
+    // PMUInfo pmu{1234, &helper, &perf_config};
+
+    while (true) {
+        /** Get from the CXLMemSimHook */
+        int n;
+        do {
+            memset(sock_buf, 0, sock_buf_size);
+            // non-blocking receive: returns -1 with EAGAIN/EWOULDBLOCK when the queue is empty
+            n = recv(sock, sock_buf, sock_buf_size, MSG_DONTWAIT);
+            if (n < 0) { // errno is only meaningful on failure; a 0-byte datagram is rejected by the size check
+                if (errno == EAGAIN || errno == EWOULDBLOCK) {
+                    // no data
+                    break;
+                } else {
+                    LOG(ERROR) << "Failed to recv";
+                    exit(-1);
+                }
+            } else if (n >= sizeof(struct op_data) && n <= sock_buf_size - 1) {
+                auto *opd = (struct op_data *)sock_buf;
+                LOG(ERROR) << fmt::format("received data: size={}, tgid={}, tid={}, opcode={}\n", n, opd->tgid,
+                                          opd->tid, opd->opcode);
+
+                if (opd->opcode == CXLMEMSIM_THREAD_CREATE || opd->opcode == CXLMEMSIM_PROCESS_CREATE) {
+                    int t = 0; // placeholder: stays 0 until the monitors.enable() call below is re-enabled
+                    bool is_process = opd->opcode == CXLMEMSIM_PROCESS_CREATE;
+                    // register to monitor
+                    LOG(DEBUG) << fmt::format("enable monitor: tgid={}, tid={}, is_process={}\n", opd->tgid, opd->tid,
+                                               is_process);
+
+                    // t = monitors.enable(opd->tgid, opd->tid, is_process, pebsperiod, tnum);
+                    if (t == -1) {
+                        LOG(ERROR) << "Failed to enable monitor\n";
+                    } else if (t < 0) {
+                        // tid not found. might be already terminated.
+                        continue;
+                    }
+                    // auto mon = monitors.mon[t];
+                    // Wait the t processes until emulation process initialized.
+                    // mon.stop();
+                    /* read CHA params */
+                    // for (auto const &[idx, value] : pmu.chas | enumerate) {
+                    //     pmu.chas[idx].read_cha_elems(&mon.before->chas[idx]);
+                    // }
+                    // for (auto const &[idx, value] : pmu.chas | enumerate) {
+                    //     pmu.chas[idx].read_cha_elems(&mon.before->chas[idx]);
+                    // }
+                    // // Run the t processes.
+                    // mon.run();
+                    // clock_gettime(CLOCK_MONOTONIC, &mon.start_exec_ts);
+                } else if (opd->opcode == CXLMEMSIM_THREAD_EXIT) {
+                    // unregister from monitor, and display results.
+                    // get the tid from the tgid
+                    LOG(ERROR)<< fmt::format("disable monitor: tgid={}, tid={}\n", opd->tgid, opd->tid);
+                    // auto mon = monitors.get_mon(opd->tgid, opd->tid);
+                    // mon.stop();
+                } else if (opd->opcode == CXLMEMSIM_STABLE_SIGNAL) {
+                    // for (auto const &[i, mon] : monitors.mon | enumerate) {
+                    //     if (mon.status == MONITOR_ON) {
+                    //         mon.stop();
+                    //         mon.status = MONITOR_SUSPEND;
+                    //     }
+                    // }
+                }
+
+            } else {
+                LOG(ERROR) << fmt::format("received data is invalid size: size={}", n);
+            }
+        } while (n > 0); // check the next message.
+    }
+}
\ No newline at end of file
diff --git a/src/uncore.cpp b/src/uncore.cpp
index 5101fc6..8dbdc00 100644
--- a/src/uncore.cpp
+++ b/src/uncore.cpp
@@ -3,14 +3,13 @@
 //
 
 #include "uncore.h"
+extern Helper helper;
 Uncore::Uncore(const uint32_t unc_idx, PerfConfig *perf_config) {
-    int ret, fd;
-    ssize_t r;
     unsigned long value;
+    int r;
     char path[64], buf[32];
-
     memset(path, 0, sizeof(path));
-    snprintf(path, sizeof(path) - 1, perf_config->path_format_cbo_type, unc_idx);
+    snprintf(path, sizeof(path) - 1, perf_config->path_format_cha_type.c_str(), unc_idx);
 
     fd = open(path, O_RDONLY);
     if (fd < 0) {
@@ -21,7 +20,7 @@ Uncore::Uncore(const uint32_t unc_idx, PerfConfig *perf_config) {
     memset(buf, 0, sizeof(buf));
     r = read(fd, buf, sizeof(buf) - 1);
     if (r < 0) {
-        LOG(ERROR) << fmt::format("read {} failed", path);
+        LOG(ERROR) << fmt::format("read {} failed", fd);
         close(fd);
         throw std::runtime_error("read");
     }
@@ -29,32 +28,26 @@ Uncore::Uncore(const uint32_t unc_idx, PerfConfig *perf_config) {
 
     value = strtoul(buf, nullptr, 10);
     if (value == ULONG_MAX) {
-        LOG(ERROR) << fmt::format("strtoul {} failed", path);
+        LOG(ERROR) << fmt::format("strtoul {} failed", fd);
         throw std::runtime_error("strtoul");
     }
 
-    int cpu = (int)unc_idx;
-    pid_t pid = -1; /* when using uncore, pid must be -1. */
-    int group_fd = -1;
-    auto attr = perf_event_attr{
-        .type = (uint32_t)value,
-        .size = sizeof(struct perf_event_attr),
-        .config = perf_config->cbo_config,
-        .disabled = 1,
-        .inherit = 1,
-        .enable_on_exec = 1,
-    };
-
-    /* when using uncore, don't set exclude_xxx flags. */
-    this->perf = new PerfInfo(group_fd, cpu, pid, 0, attr);
+    for (auto const &[k, v] : this->perf | enumerate) {
+        v = init_uncore_perf(-1, (int)unc_idx, std::get<1>(perf_config->cha[k]), std::get<2>(perf_config->cha[k]),
+                             value);
+    }
 }
 
-int Uncore::read_cbo_elems(struct CBOElem *elem) {
-    int r = this->perf->read_pmu(&elem->llc_wb);
-    if (r < 0) {
-        LOG(ERROR) << fmt::format("perf_read_pmu failed.\n");
+int Uncore::read_cha_elems(struct CHAElem *elem) {
+    ssize_t r;
+    for (auto const &[idx, value] : this->perf | enumerate) {
+        r = value->read_pmu(&elem->cha[idx]);
+        if (r < 0) {
+            LOG(ERROR) << fmt::format("read cha_elems[{}] failed.\n", std::get<0>(helper.perf_conf.cha[idx]));
+            return r;
+        }
+        LOG(DEBUG) << fmt::format("read cha_elems[{}]:{}\n", std::get<0>(helper.perf_conf.cha[idx]), elem->cha[idx]);
     }
 
-    LOG(DEBUG) << fmt::format("llc_wb:{}\n", elem->llc_wb);
-    return r;
+    return 0;
 }
diff --git a/workloads/CMakeLists.txt b/workloads/CMakeLists.txt
new file mode 100644
index 0000000..e69de29