Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CUDA 12+ support with Nvbit #72

Merged
merged 43 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
7e99f47
nvbit
ejchung0406 Apr 14, 2023
2fba105
ejchung0406 Apr 14, 2023
bfc962e
ejchung0406 Apr 14, 2023
d978a66
ejchung0406 Apr 14, 2023
519bd9a
Merge branch 'master' into nvbit
ejchung0406 Apr 14, 2023
bad27da
ejchung0406 Apr 15, 2023
998a74a
ejchung0406 Apr 15, 2023
25c8e1a
ejchung0406 Apr 15, 2023
5933a69
merged from euijun's repo
jaewon-lee-github Apr 15, 2023
82db270
compile debugging
hyesoon Apr 15, 2023
d57714f
make it compile
hyesoon Apr 15, 2023
3ac3e23
add nvbit_sim parameters
hyesoon Apr 16, 2023
7f3a3bb
minor changes for nvbit file read
hyesoon Apr 16, 2023
f4114c6
minor changes for nvbit
hyesoon Apr 16, 2023
4512194
add nvbit core type checking
hyesoon Apr 16, 2023
62f6181
add nvbit uop latency
hyesoon Apr 17, 2023
88b1242
add nvbit latency
hyesoon Apr 17, 2023
7e437df
minor latency
hyesoon Apr 17, 2023
a26e650
Fixed bugs on Trace reader.
jaewon-lee-github Apr 17, 2023
98487a2
get size from trace_info_nvbit_small_s instead of trace_info_nvbit_s
jaewon-lee-github Apr 17, 2023
76ee746
default is GPU architecture
hyesoon Apr 17, 2023
0240d63
ejchung0406 Apr 17, 2023
4f6795c
Merge branch 'nvbit' of https://github.com/gthparch/macsim into gthparch-nvbit
ejchung0406 Apr 17, 2023
08983c3
Merge branch 'gthparch-nvbit' into nvbit
ejchung0406 Apr 17, 2023
1ce218c
ejchung0406 Apr 17, 2023
12350ae
ejchung0406 Apr 17, 2023
9a8766d
ejchung0406 Apr 17, 2023
769600d
Update uop.h
ejchung0406 Apr 17, 2023
c68094b
Nvbit (#63)
ejchung0406 Apr 19, 2023
ce99ccb
minor bug fix
ejchung0406 Apr 19, 2023
5852a7b
ejchung0406 Apr 19, 2023
92d2e83
Merge branch 'nvbit' into nvbit
ejchung0406 Apr 19, 2023
6f4a7fd
Merge pull request #2 from gthparch/nvbit
ejchung0406 Apr 19, 2023
de2b572
fixed some bugs
ejchung0406 Apr 20, 2023
0833cce
Merge pull request #64 from ejchung0406/nvbit
ejchung0406 Apr 20, 2023
b813ded
bug fix
ejchung0406 Jun 12, 2023
4310dcb
bug
ejchung0406 Jun 12, 2023
3bf5a4d
minor bug fix
ejchung0406 Feb 27, 2024
9262478
bug fix (gpu barrier)
ejchung0406 Feb 29, 2024
8246f66
added nvbit tracer for cuda 12+
ejchung0406 Oct 29, 2024
c245754
Merge branch 'master' into nvbit
ejchung0406 Oct 29, 2024
6f44f98
added example nvbit trace
ejchung0406 Oct 29, 2024
bee642e
removed m_trace_id
ejchung0406 Oct 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
The diff you're trying to view is too large. We only load the first 3000 changed files.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ src/statsEnums.h
src/cscope
tools/mem_trace_generator/*.raw
tools/mem_trace_generator/mem_trace
bin/
bin/macsim
bin/*.out
bin/*.out.0
.dbg_build/
*.lo
*.la
Expand Down
5 changes: 4 additions & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
[submodule "internal"]
path = internal
url = https://github.com/gthparch/macsim_internal
url = git@github.com:gthparch/macsim_internal.git
[submodule "src/rwqueue"]
path = src/rwqueue
url = https://github.com/cameron314/readerwriterqueue
[submodule "tools/CUDA_trace_generator"]
path = tools/CUDA_trace_generator
url = git@github.com:ejchung0406/CUDA_trace_generator.git
1 change: 1 addition & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ src/trace_read_cpu.cc src/trace_read_cpu.h \
src/trace_read_gpu.cc src/trace_read_gpu.h \
src/trace_read_a64.cc src/trace_read_a64.h \
src/trace_read_igpu.cc src/trace_read_igpu.h \
src/trace_read_nvbit.cc src/trace_read_nvbit.h \
src/uop.cc src/uop.h \
src/utils.cc src/utils.h \
src/network.cc src/network.h \
Expand Down
1 change: 1 addition & 0 deletions SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ macsim_src = [
'src/trace_read_gpu.cc',
'src/trace_read_a64.cc',
'src/trace_read_igpu.cc',
'src/trace_read_nvbit.cc',
'src/page_mapping.cc',
'src/dyfr.cc',
'src/hmc_process.cc',
Expand Down
152 changes: 89 additions & 63 deletions bin/params.in
Original file line number Diff line number Diff line change
@@ -1,85 +1,112 @@

# Simulation Configuration
num_sim_cores 1
num_sim_small_cores 0
num_sim_cores 16
num_sim_small_cores 16
core_type nvbit
max_threads_per_core 1024
num_sim_medium_cores 0
num_sim_large_cores 1
large_core_type x86
sim_cycle_count 0
max_insts 3000000
heartbeat_interval 1000000
forward_progress_limit 1000000
core_thread_sched balanced
num_sim_large_cores 0


# Clock
clock_cpu 1.15
clock_gpu 1.15
clock_noc 1.15
clock_mc 1.15
# from device query for gtx580 on damint - gpu clock - 1.66 GHz, mem clock - 2100 MHz
clock_cpu 2.0
clock_gpu 2.0
clock_llc 2.0
clock_noc 2.0
clock_mc 2.0


# Common Core Configuration
# Small Core Configuration
fetch_wdith 4
width 1
fetch_latency 5
alloc_latency 5
rob_size 1024
schedule ooo
isched_rate 4
msched_rate 4
fsched_rate 4
bp_hist_length 14
max_block_per_core 8
fetch_policy rr
mt_no_fetch_br 1
one_cycle_exec 0
uop_latency_map x86

# Large Core Configuration
large_width 2
large_core_fetch_latency 5
large_core_alloc_latency 10
isched_large_rate 4
msched_large_rate 2
fsched_large_rate 2
ssched_large_rate 1
isched_large_size 64
msched_large_size 32
fsched_large_size 96
ssched_large_size 128
bp_hist_length 16
rob_large_size 512
large_core_schedule ooo
max_threads_per_large_core 7

mem_mshr_size 9

# L3-I
icache_large_num_set 4096 # 768 KB
icache_large_assoc 3
icache_large_line_size 64
icache_large_cycles 14

# L3-D
l1_large_num_set 512 # 512 KB
l1_large_assoc 16
l1_large_line_size 64
l1_large_latency 100
l1_large_bypass 0
fetch_only_load_ready 0
schedule_ratio 4
fetch_ratio 4
gpu_sched 1
icache_num_set 8


# Memory
memory_type igpu_network
memory_type l2_decoupled_network
perfect_dcache 0
enable_cache_coherence 0
dram_merge_requests 1
mem_ooo_stores 0
ptx_common_cache 0
const_cache_size 8192
texture_cache_size 8192
shared_mem_size 16384
shared_mem_banks 32
shared_mem_cycles 2
shared_mem_ports 1
byte_level_access 0

l1_small_line_size 128
#96 KB
l1_small_num_set 128
l1_small_assoc 6
#16 KB
#l1_small_num_set 32
#l1_small_assoc 4

l1_small_latency 30
l2_small_latency 100
llc_latency 200

# L3 Cache (4.5MB 24 way)
num_llc 12
llc_num_set 128
llc_line_size 128
llc_assoc 24
llc_num_bank 4
llc_latency 200

# LLC
num_l3 1
l3_num_set 8192
l3_assoc 32
l3_line_size 64
l3_latency 100

# DRAM
dram_bus_width 4
dram_num_mc 6
dram_bus_width 8
dram_column 11
dram_activate 25
dram_precharge 10
dram_num_banks 16
dram_num_channel 8
dram_rowbuffer_size 2048
dram_scheduling_policy FRFCFS
dram_additional_latency 95

# ETC


infinite_port 0
pref_train_inst_once 0
pref_framework_on 1



bug_detector_enable 1

perfect_icache 1
ideal_noc 1
sim_cycle_count 0
max_insts 200000000
heartbeat_interval 1000000
forward_progress_limit 100000
blocks_to_simulate 0
ptx_exec_ratio 2
num_warp_scheduler 2


noc_topology simple_noc
noc_dimension 0
link_width 32

# DEBUG
debug_core_id 0
Expand All @@ -93,9 +120,8 @@ debug_dcu_stage 0
debug_retire_stage 0
debug_map_stage 0
debug_mem 0
debug_trace_read 0
debug_trace_read 1
debug_print_trace 1
debug_sim_thread_schedule 0
debug_cache_lib 0
debug_bp_dir 0
debug_print_trace 0
debug_noc 0
2 changes: 1 addition & 1 deletion bin/trace_file_list
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
1
../sst-unit-test/traces/cachesize_1/trace.txt
../sst-unit-test/traces/nvbit/vectormultadd/65536/kernel_config.txt
Loading
Loading