add development for sapphire rapids

SlugLab · Nov 23, 2023 · d34018f · d34018f
1 parent b5181aa
commit d34018f
Show file tree

Hide file tree

Showing 62 changed files with 3,453 additions and 107,512 deletions.
diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
@@ -2,14 +2,15 @@ name: CMake
 
 on:
   push:
-    branches: [ "main" ]
+    branches: [ "main", "asplos-dev" ]
   pull_request:
-    branches: [ "main" ]
+    branches: [ "main", "asplos-dev"  ]
 
 env:
   # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
   BUILD_TYPE: Release
 
+
 jobs:
   build:
     # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac.
@@ -21,20 +22,18 @@ jobs:
     - uses: actions/checkout@v3
 
     - name: Install dependencies
-      run: sudo apt install llvm-dev clang libbpf-dev libclang-dev libcxxopts-dev libfmt-dev librange-v3-dev
-
-    - name: Install Custom dependencies
-      run: wget http://launchpadlibrarian.net/605552811/libbpf0_0.8.0-1_amd64.deb && wget http://launchpadlibrarian.net/605552807/libbpf-dev_0.8.0-1_amd64.deb && sudo dpkg -i ./libbpf0_0.8.0-1_amd64.deb && sudo dpkg -i ./libbpf-dev_0.8.0-1_amd64.deb 
+      run: sudo apt install llvm-dev clang libbpf-dev libclang-dev python3-pip gcc-13 g++-13 && git submodule update --init --recursive
 
-    - name: Sed Current uncompiled include file
-      run: sudo sed -i 's/NL_SET_ERR_MSG_MOD/\/\/NL_SET_ERR_MSG_MOD/g' /usr/src/linux-headers-`uname -r`/include/net/flow_offload.h
+    - name: Install conan
+      working-directory: ${{github.workspace}}
+      run: pip3 install conan && conan profile detect && mkdir build && cd build && cp ../conanfile.txt . && CC=gcc-13 CXX=g++-13 conan install . -s compiler.cppstd=gnu23
 
     - name: Configure CMake
       # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
       # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
-      run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
+      run: CC=gcc-13 CXX=g++-13 cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DCMAKE_TOOLCHAIN_FILE=${{github.workspace}}/build/build/${{env.BUILD_TYPE}}/generators/conan_toolchain.cmake -DCMAKE_POLICY_DEFAULT_CMP0091=NEW
 
     - name: Build
       # Build your program with the given configuration
-      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
+      run: CC=gcc-13 CXX=g++-13 cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
 
diff --git a/.gitignore b/.gitignore
@@ -8,4 +8,11 @@ benchmarks*
 Makefile
 *.o
 py_smdk_pkg
-lib
+lib
+CMakePresets.json
+.cmake
+CMakeUserPresets.json
+*~
+voltdb
+foo
+CMakeFiles
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,15 @@
+[submodule "workloads/memcached-ycsb"]
+	path = workloads/memcached-ycsb
+	url = https://github.com/SlugLab/YCSB/
+[submodule "workloads/memcached"]
+	path = workloads/memcached
+	url = https://github.com/memcached/memcached
+[submodule "workloads/gapbs"]
+	path = workloads/gapbs
+	url = https://github.com/victoryang00/gapbs
+[submodule "script/perfmon"]
+	path = script/perfmon
+	url = https://github.com/intel/perfmon
+[submodule "workloads/vectordb"]
+	path = workloads/vectordb
+	url = https://github.com/jina-ai/vectordb
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,29 +1,30 @@
 cmake_minimum_required(VERSION 3.11.0)
-project(CXL-MEM-Simulator VERSION 0.1.0)
+project(CXLMemSim VERSION 0.1.0)
+set(CMAKE_CXX_STANDARD 23)
+
+if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+    add_compile_options (-fdiagnostics-color=always)
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+    add_compile_options (-fcolor-diagnostics)
+endif ()
+
+add_subdirectory(microbench)
+add_subdirectory(workloads)
+
+list(APPEND CMAKE_PREFIX_PATH ${CMAKE_BINARY_DIR})
 
 find_package(cxxopts REQUIRED)
 find_package(fmt REQUIRED)
-find_package(range-v3 REQUIRED)
 file(GLOB_RECURSE SOURCE_FILES src/*.cpp)
 
 execute_process(COMMAND uname -r OUTPUT_VARIABLE arch OUTPUT_STRIP_TRAILING_WHITESPACE)
-set(LINUX_SOURCE /lib/modules/${arch}/build/)
-set(CMAKE_CXX_FLAGS "-Wall -g -pthread -lrt -rdynamic")
-set(CMAKE_CXX_STANDARD 23)
-
-add_executable(CXL-MEM-Simulator ${SOURCE_FILES})
-
-include_directories(CXL-MEM-Simulator include)
-target_link_libraries(CXL-MEM-Simulator cxxopts::cxxopts fmt::fmt range-v3::range-v3 elf bpf)
+set(CMAKE_CXX_FLAGS "-Wall -fPIC -pthread -ldl -lrt -mavx512f -mpreferred-stack-boundary=4 -g")
 
-function(bpf prefix)
-    add_custom_target(${prefix}_bpf ALL
-            COMMAND clang --target=bpf -nostdinc -S -I/usr/include/linux -I${CMAKE_SOURCE_DIR}/include -I${LINUX_SOURCE}/arch/x86/include -I/usr/include -I${LINUX_SOURCE}/arch/x86/include/uapi -I${LINUX_SOURCE}/arch/x86/include/generated -I${LINUX_SOURCE}/arch/x86/include/generated/uapi -I${LINUX_SOURCE}/include -I${LINUX_SOURCE}/include/uapi -I${LINUX_SOURCE}/include/generated/uapi -I${LINUX_SOURCE}/tools/testing/selftests/bpf -include ${LINUX_SOURCE}/include/linux/kconfig.h -D__KERNEL__ -D__ASM_SYSREG_H -D__BPF_TRACING__ -D__TARGET_ARCH_x86 -Wno-implicit-function-declaration -O3 -emit-llvm -g -c ${CMAKE_SOURCE_DIR}/src/${prefix}.c -o ${CMAKE_BINARY_DIR}/${prefix}.ll
-            COMMAND llc -march=bpf -filetype=obj -o ${CMAKE_BINARY_DIR}/${prefix}.o ${CMAKE_BINARY_DIR}/${prefix}.ll
-            )
-    add_dependencies(CXL-MEM-Simulator ${prefix}_bpf)
-endfunction()
+add_executable(CXLMemSim ${SOURCE_FILES} src/main.cc)
 
-bpf(collectmmap)
+include_directories(CXLMemSim include ${cxxopts_INCLUDE_DIR} ${fmt_INCLUDE_DIR})
+target_link_libraries(CXLMemSim fmt::fmt cxxopts::cxxopts)
 
-add_subdirectory(microbench)
+add_library(CXLMemSimHook SHARED src/module.cc)
+add_executable(CXLMemSimSock ${SOURCE_FILES} src/sock.cc)
+target_link_libraries(CXLMemSimSock fmt::fmt cxxopts::cxxopts)
diff --git a/README.md b/README.md
@@ -1,20 +1,20 @@
 # CXL.mem Simulator
-The epoch design of this project is mostly refering to [mes](https://github.com/takahiro-hirofuchi/mesmeric-emulator), the novelty is use pebs to construct the topology and calculate the hierachy latency based on this. See the [talk](https://docs.google.com/file/d/1bZi2rbB-u5xMw_YET726gb2s9QuxMZJE/edit?usp=docslist_api&filetype=mspresentation)
+The CXL.mem simulator uses a target latency to simulate CXL.mem from the CPU's perspective, factoring the ROB and the different cacheline states into the penalty at the application level.
 
 ## Prerequisite
 ```bash
 $ uname -a
-Linux gpu01 5.19.0-29-generic #30-Ubuntu SMP PREEMPT_DYNAMIC Wed Jan 4 12:14:09 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
-$ sudo apt install llvm-dev clang libbpf-dev libclang-dev libcxxopts-dev libfmt-dev librange-v3-dev
+Linux banana 6.4.0+ #86 SMP PREEMPT_DYNAMIC Fri Jul 28 23:49:33 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
+$ echo 0 | sudo tee /sys/devices/system/node/node1/cpu*/online >/dev/null 2>&1
 ```
 ## User input
 ```bash
-LOGV=1 ./CXL-MEM-Simulator -t ./microbench/many_calloc -i 5 -c 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+LOGV=1 ./CXL-MEM-Simulator -t ./microbench/ld -i 5 -c 0,2 -d 85 -b 10,10 -l 100,100 -c 100,100 -w 85.5,86.5,87.5,85.5,86.5,87.5,88. -o "(1,(2,3))"
 ```
 1. -t Target: The path to the executable
 2. -i Interval: The epoch of the simulator, the parameter is in milisecond
 3. -c CPUSet: The core id to run the executable and the rest will be `setaffinity` to one other core
-4. -d Dram Latency: The current platform's DRAM latency, default is 85ns
+4. -d Dram Latency: The current platform's DRAM latency, default is 85ns # mark that bw in the remote
 5. -b, -l Bandwidth, Latency: Both use 2 input in the vector, first for read, second for write
 6. -c Capacity: The capacity of the memory with first be local, remaining accordingly to the input vector.
 7. -w Weight: Use the heuristic to calculate the bandwidth
@@ -30,16 +30,3 @@ LOGV=1 ./CXL-MEM-Simulator -t ./microbench/many_calloc -i 5 -c 0,1,2,3,4,5,6,7,8
                   3
 ```
 9. env LOGV stands for logs level that you can see.
-## Limitation
-The pebs requires no larger than 5 `perf_open_event` attached to certain PID, so I limit the bpf program to munmap(kprobe) and sbrk(kprobe/kretprobe), you can configure them. For multiple process application, I need to first SIGSTOP the process and `send/recv` back the PID information. For client and server application, I need to SIGSTOP/SIGCONT on both client and server simultaneously, which is not implemented yet.
-
-## Cite
-```bash
-@article{yangyarch23,
-  title={CXLMemSim: A pure software simulated CXL.mem for performance characterization},
-  author={Yiwei Yang, Pooneh Safayenikoo, Jiacheng Ma, Tanvir Ahmed Khan, Andrew Quinn},
-  journal={arXiv preprint arXiv:2303.06153},
-  booktitle={The fifth Young Architect Workshop (YArch'23)},
-  year={2023}
-}
-```
diff --git a/artifact/build_and_run_all.sh b/artifact/build_and_run_all.sh
diff --git a/artifact/compare_with_gem5.sh b/artifact/compare_with_gem5.sh
diff --git a/artifact/gen_workloads.sh b/artifact/gen_workloads.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# anns
+# monetdb
+# pointer_chasing lmbench3
+# wrf stream   
+# mlc
+# gromacs
+# smdk's
+
+git clone https://github.com/scott-beamer/gapbs.git
+cd gapbs
+make benchmark
+
+wget https://files.rcsb.org/download/4i4f.pdb
diff --git a/artifact/mlc.txt → artifact/mlc-alderlake.txt b/artifact/mlc.txt → artifact/mlc-alderlake.txt
diff --git a/artifact/mlc-sapphirerapids.txt b/artifact/mlc-sapphirerapids.txt
@@ -0,0 +1,55 @@
+Intel(R) Memory Latency Checker - v3.10
+*** Unable to modify prefetchers (try executing 'modprobe msr')
+*** So, enabling random access for latency measurements
+Measuring idle latencies for random access (in ns)...
+                Numa node   Numa node
+Numa node             0           1
+       0          106.3       437.5
+
+Measuring Peak Injection Memory Bandwidths for the system
+Bandwidths are in MB/sec (1 MB/sec = 1,000,000 Bytes/sec)
+Using all the threads from each core if Hyper-threading is enabled
+Using traffic with the following read-write ratios
+ALL Reads        :      28611.9
+3:1 Reads-Writes :      25057.1
+2:1 Reads-Writes :      25078.0
+1:1 Reads-Writes :      23965.9
+Stream-triad like:      24943.3
+
+Measuring Memory Bandwidths between nodes within system 
+Bandwidths are in MB/sec (1 MB/sec = 1,000,000 Bytes/sec)
+Using all the threads from each core if Hyper-threading is enabled
+Using Read-only traffic type
+                Numa node   Numa node
+Numa node             0            1
+       0        28612.0      19216.8
+
+Measuring Loaded Latencies for the system
+Using all the threads from each core if Hyper-threading is enabled
+Using Read-only traffic type
+Inject  Latency Bandwidth
+Delay   (ns)    MB/sec
+==========================
+00000   370.12   28393.2
+00002   369.29   28435.4
+00008   378.41   28490.5
+00015   354.32   28414.2
+00050   313.07   28323.0
+00100   238.51   28010.5
+00200   125.13   14566.0
+00300   119.69   10232.0
+00400   116.76   7905.3
+00500   115.33   6500.4
+00700   113.89   4858.3
+01000   113.03   3594.6
+01300   112.57   2906.9
+01700   112.09   2363.9
+02500   111.51   1798.9
+03500   111.21   1520.8
+05000   110.77   1193.2
+09000   110.38   922.3
+20000   110.14   735.6
+
+Measuring cache-to-cache transfer latency (in ns)...
+Local Socket L2->L2 HIT  latency        67.3
+Local Socket L2->L2 HITM latency        67.5
diff --git a/conanfile.txt b/conanfile.txt
@@ -0,0 +1,9 @@
+[requires]
+cxxopts/3.0.0
+fmt/9.0.0
+nlohmann_json/3.11.2
+[generators]
+CMakeDeps
+CMakeToolchain
+[layout]
+cmake_layout
diff --git a/include/alloc.h b/include/alloc.h
diff --git a/include/cxlcontroller.h b/include/cxlcontroller.h
@@ -2,43 +2,64 @@
 // Created by victoryang00 on 1/14/23.
 //
 
-#ifndef CXL_MEM_SIMULATOR_CXLCONTROLLER_H
-#define CXL_MEM_SIMULATOR_CXLCONTROLLER_H
+#ifndef CXLMEMSIM_CXLCONTROLLER_H
+#define CXLMEMSIM_CXLCONTROLLER_H
 
 #include "cxlcounter.h"
 #include "cxlendpoint.h"
 #include <cstdint>
 #include <string_view>
+#include <unordered_map>
 #include <vector>
 
+enum page_type { CACHELINE, PAGE, HUGEPAGE_2M, HUGEPAGE_1G }; // type of CXLController::page_type_ and of the matching constructor argument
+
 class CXLController;
-class Policy {
+class AllocationPolicy {
 public:
-    Policy();
+    AllocationPolicy();
     virtual int compute_once(CXLController *) = 0;
+    // No write problem
+};
+class MigrationPolicy {
+public:
+    MigrationPolicy();
+    virtual int compute_once(CXLController *) = 0; // pure virtual: subclasses must implement; original note "reader writer" — NOTE(review): meaning unclear, confirm
+    // placeholder: paging-related members (none declared yet)
+    // placeholder: switching-related members (none declared yet)
+};
+
+// TODO: needs a timeout; per the original note it will be accounted for as added latency later.
+class PagingPolicy {
+public:
+    PagingPolicy();
+    virtual int compute_once(CXLController *) = 0; // pure virtual: subclasses must implement; original note "reader writer" — NOTE(review): meaning unclear, confirm
+    // placeholder: paging-related members (none declared yet)
+};
+
 class CXLController : CXLSwitch {
 public:
     std::vector<CXLMemExpander *> cur_expanders{};
     int capacity; // GB
-    Policy *policy;
+    AllocationPolicy *policy;
     CXLCounter counter;
     std::map<uint64_t, uint64_t> occupation;
     std::map<uint64_t, uint64_t> va_pa_map;
-    bool is_page;
+    enum page_type page_type_; // percentage
     int num_switches = 0;
-    CXLController(Policy *policy, int capacity, bool is_page, int epoch);
+
+    CXLController(AllocationPolicy *p, int capacity, enum page_type page_type_, int epoch);
     void construct_topo(std::string_view newick_tree);
     void insert_end_point(CXLMemExpander *end_point);
     std::vector<std::string> tokenize(const std::string_view &s);
-    std::tuple<double,std::vector<uint64_t>> calculate_congestion() override;
+    std::tuple<double, std::vector<uint64_t>> calculate_congestion() override;
     void set_epoch(int epoch) override;
     std::tuple<int, int> get_all_access() override;
-    double calculate_latency(LatencyPass elem); // traverse the tree to calculate the latency
-    double calculate_bandwidth(BandwidthPass elem);
+    double calculate_latency(LatencyPass elem) override; // traverse the tree to calculate the latency
+    double calculate_bandwidth(BandwidthPass elem) override;
     int insert(uint64_t timestamp, uint64_t phys_addr, uint64_t virt_addr, int index) override;
     void delete_entry(uint64_t addr, uint64_t length) override;
     std::string output() override;
 };
 
-#endif // CXL_MEM_SIMULATOR_CXLCONTROLLER_H
+#endif // CXLMEMSIM_CXLCONTROLLER_H