From 99c6a1af5a58cbd915f2a0ed9fe279d77d99498a Mon Sep 17 00:00:00 2001 From: Varsha Singhania Date: Mon, 17 Jun 2024 04:28:51 -0400 Subject: [PATCH 1/7] Tensor cores in Vortex --- ci/blackbox.sh | 12 +- hw/rtl/VX_config.vh | 18 ++ hw/rtl/VX_types.vh | 3 + kernel/include/vx_intrinsics.h | 20 ++ runtime/include/vortex.h | 2 + runtime/simx/vortex.cpp | 8 +- sim/simx/arch.h | 15 +- sim/simx/core.cpp | 4 +- sim/simx/core.h | 1 + sim/simx/decode.cpp | 19 ++ sim/simx/emulator.cpp | 18 +- sim/simx/emulator.h | 4 + sim/simx/execute.cpp | 179 +++++++++++++++ sim/simx/func_unit.cpp | 91 ++++++-- sim/simx/func_unit.h | 9 + sim/simx/instr.h | 2 +- sim/simx/instr_trace.h | 1 + sim/simx/main.cpp | 4 +- sim/simx/types.h | 23 +- tests/regression/matmul/Makefile | 14 ++ tests/regression/matmul/common.h | 17 ++ tests/regression/matmul/kernel.cpp | 124 +++++++++++ tests/regression/matmul/main.cpp | 343 +++++++++++++++++++++++++++++ 23 files changed, 899 insertions(+), 32 deletions(-) create mode 100644 tests/regression/matmul/Makefile create mode 100644 tests/regression/matmul/common.h create mode 100644 tests/regression/matmul/kernel.cpp create mode 100644 tests/regression/matmul/main.cpp diff --git a/ci/blackbox.sh b/ci/blackbox.sh index fe94677aa..8a04133f9 100755 --- a/ci/blackbox.sh +++ b/ci/blackbox.sh @@ -48,6 +48,8 @@ PERF_CLASS=0 REBUILD=2 TEMPBUILD=0 LOGFILE=run.log +TC_SIZE=567 +TC_NUM=123 for i in "$@" do @@ -112,6 +114,14 @@ case $i in LOGFILE=${i#*=} shift ;; + --tc_size=*) + TC_SIZE=${i#*=} + shift + ;; + --tc_num=*) + TC_NUM=${i#*=} + shift + ;; --help) show_help exit 0 @@ -180,7 +190,7 @@ then fi CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS" - +CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DTC_NUM=$TC_NUM -DTC_SIZE=$TC_SIZE $L2 $L3 $PERF_FLAG $CONFIGS" echo "CONFIGS=$CONFIGS" if [ $REBUILD -ne 0 ] diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 7fc8d1464..651234768 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -111,6 +111,24 @@ `endif `define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE) +`ifndef TC_SIZE +`define TC_SIZE 4 +`endif + +`ifndef TC_NUM +`define TC_NUM 1 +`endif + +// Number of TCU units +`ifndef NUM_TCU_LANES +`define NUM_TCU_LANES `TC_NUM +`endif + +// Number of TCU units +`ifndef NUM_TCU_BLOCKS +`define NUM_TCU_BLOCKS `ISSUE_WIDTH +`endif + `ifdef L2_ENABLE `define L2_ENABLED 1 `else diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index e744a26f9..06929b058 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -196,4 +196,7 @@ `define VX_CSR_NUM_CORES 12'hFC2 `define VX_CSR_LOCAL_MEM_BASE 12'hFC3 +`define VX_MAT_MUL_SIZE 12'hFC4 + + `endif // VX_TYPES_VH diff --git a/kernel/include/vx_intrinsics.h b/kernel/include/vx_intrinsics.h index 6000065e9..b67a770da 100644 --- a/kernel/include/vx_intrinsics.h +++ b/kernel/include/vx_intrinsics.h @@ -221,6 +221,26 @@ inline void vx_fence() { __asm__ volatile ("fence iorw, iorw"); } +//Matrix load +//Converted instruction type cause destination registers were not getiing blocked otherwise +inline void mload(unsigned dest, unsigned addr) +{ + asm volatile (".insn i 0x7b, 0, x0, %0(%1)" :: "i"(dest), "r"(addr)); +} + +//mat store +inline void ms(unsigned addr) +{ + asm volatile (".insn i 0x7b, 1, x0, 0(%0)" :: "r"(addr)); +} + +//mat mul +//num tiles along reduced K dimension of matmul as imm value (can use rd,rs field to expand range of n_tiles from 12 bits) +inline void mm() +{ + asm volatile (".insn i 0x7b, 2, x0, 0(x0)"); +} + #ifdef __cplusplus } #endif diff --git a/runtime/include/vortex.h b/runtime/include/vortex.h index c9dd6ec36..f1a412b81 100644 --- a/runtime/include/vortex.h +++ b/runtime/include/vortex.h @@ -34,6 +34,8 @@ typedef void* vx_buffer_h; #define VX_CAPS_GLOBAL_MEM_SIZE 0x5 #define VX_CAPS_LOCAL_MEM_SIZE 0x6 #define VX_CAPS_ISA_FLAGS 0x7 +#define VX_CAPS_TC_SIZE 0x8 +#define VX_CAPS_TC_NUM 0x9 // device isa flags #define VX_ISA_STD_A (1ull << 0) diff --git a/runtime/simx/vortex.cpp b/runtime/simx/vortex.cpp index 89856f3a0..f65d7b385 100644 --- a/runtime/simx/vortex.cpp +++ b/runtime/simx/vortex.cpp @@ -32,7 +32,7 @@ using namespace vortex; class vx_device { public: vx_device() - : arch_(NUM_THREADS, NUM_WARPS, NUM_CORES) + : arch_(NUM_THREADS, NUM_WARPS, NUM_CORES, TC_SIZE, TC_NUM) , ram_(0, RAM_PAGE_SIZE) , processor_(arch_) , global_mem_(ALLOC_BASE_ADDR, @@ -69,6 +69,12 @@ class vx_device { case VX_CAPS_NUM_CORES: _value = NUM_CORES * NUM_CLUSTERS; break; + case VX_CAPS_TC_SIZE: + _value = TC_SIZE; + break; + case VX_CAPS_TC_NUM: + _value = TC_NUM; + break; case VX_CAPS_CACHE_LINE_SIZE: _value = CACHE_BLOCK_SIZE; break; diff --git a/sim/simx/arch.h b/sim/simx/arch.h index 2507bf28f..e35687dbd 100644 --- a/sim/simx/arch.h +++ b/sim/simx/arch.h @@ -35,9 +35,11 @@ class Arch { uint16_t num_barriers_; uint16_t ipdom_size_; uint64_t local_mem_base_; + uint16_t tc_size_; + uint16_t tc_num_; public: - Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores) + Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint64_t tc_size, uint64_t tc_num) : num_threads_(num_threads) , num_warps_(num_warps) , num_cores_(num_cores) @@ -49,6 +51,8 @@ class Arch { , num_barriers_(NUM_BARRIERS) , ipdom_size_((num_threads-1) * 2) , local_mem_base_(LMEM_BASE_ADDR) + , tc_size_ (tc_size) + , tc_num_ (tc_num) {} uint16_t vsize() const { @@ -94,6 +98,15 @@ class Arch { uint16_t socket_size() const { return socket_size_; } + + uint16_t tc_size() const { + return tc_size_; + } + + uint16_t tc_num() const { + return tc_num_; + } + }; } \ No newline at end of file diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index 0bd72524d..7020cf8ff 100644 --- a/sim/simx/core.cpp +++ b/sim/simx/core.cpp @@ -105,12 +105,14 @@ Core::Core(const SimContext& ctx, dispatchers_.at((int)FUType::FPU) = SimPlatform::instance().create_object(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES); dispatchers_.at((int)FUType::LSU) = SimPlatform::instance().create_object(arch, 2, NUM_LSU_BLOCKS, NUM_LSU_LANES); dispatchers_.at((int)FUType::SFU) = SimPlatform::instance().create_object(arch, 2, NUM_SFU_BLOCKS, NUM_SFU_LANES); - + dispatchers_.at((int)FUType::TCU) = SimPlatform::instance().create_object(arch, 2, NUM_TCU_BLOCKS, NUM_TCU_LANES); + // initialize execute units func_units_.at((int)FUType::ALU) = SimPlatform::instance().create_object(this); func_units_.at((int)FUType::FPU) = SimPlatform::instance().create_object(this); func_units_.at((int)FUType::LSU) = SimPlatform::instance().create_object(this); func_units_.at((int)FUType::SFU) = SimPlatform::instance().create_object(this); + func_units_.at((int)FUType::TCU) = SimPlatform::instance().create_object(this); // bind commit arbiters for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { diff --git a/sim/simx/core.h b/sim/simx/core.h index cc0e46c8c..0b82de84a 100644 --- a/sim/simx/core.h +++ b/sim/simx/core.h @@ -170,6 +170,7 @@ class Core : public SimObject { friend class AluUnit; friend class FpuUnit; friend class SfuUnit; + friend class TcuUnit; }; } // namespace vortex diff --git a/sim/simx/decode.cpp b/sim/simx/decode.cpp index f934524c3..4d8d0a105 100644 --- a/sim/simx/decode.cpp +++ b/sim/simx/decode.cpp @@ -51,6 +51,7 @@ static const std::unordered_map sc_instTable = { {Opcode::EXT2, InstType::R4}, {Opcode::R_W, InstType::R}, {Opcode::I_W, InstType::I}, + {Opcode::TCU, InstType::I}, }; enum Constants { @@ -405,6 +406,16 @@ static const char* op_string(const Instr &instr) { default: std::abort(); } + + case Opcode::TCU: + switch(func3) + { + case 0: return "ML"; // + case 1: return "MS"; // + case 2: return "MATMUL"; + default: + std::abort(); + } default: std::abort(); } @@ -543,6 +554,14 @@ std::shared_ptr Emulator::decode(uint32_t code) const { case InstType::I: { switch (op) { + case Opcode::TCU: { + instr->setDestReg(rs1, RegType::Integer); + instr->addSrcReg(rs1, RegType::Integer); + instr->setFunc3(func3); + instr->setFunc7(func7); + auto imm = code >> shift_rs2; + instr->setImm(sext(imm, width_i_imm)); + } break; case Opcode::I: case Opcode::I_W: case Opcode::JALR: diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp index 5850bfd56..ea5f72c42 100644 --- a/sim/simx/emulator.cpp +++ b/sim/simx/emulator.cpp @@ -74,6 +74,7 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core) , core_(core) , warps_(arch.num_warps(), arch) , barriers_(arch.num_barriers(), 0) + , scratchpad(std::vector(core->arch().tc_size() * core->arch().tc_size() * 32768)) //Fix this { this->clear(); } @@ -110,6 +111,11 @@ void Emulator::clear() { active_warps_.set(0); warps_[0].tmask.set(0); wspawn_.valid = false; + + for (auto& reg : scratchpad) + { + reg = 0; + } } void Emulator::attach_ram(RAM* ram) { @@ -344,6 +350,11 @@ void Emulator::cout_flush() { case (addr + (VX_CSR_MPM_BASE_H-VX_CSR_MPM_BASE)) : return ((value >> 32) & 0xFFFFFFFF) #endif +Word Emulator::get_tiles() +{ + return mat_size; +} + Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { auto core_perf = core_->perf_stats(); switch (addr) { @@ -375,6 +386,8 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_NUM_CORES: return uint32_t(arch_.num_cores()) * arch_.num_clusters(); case VX_CSR_LOCAL_MEM_BASE: return arch_.local_mem_base(); case VX_CSR_MSCRATCH: return csr_mscratch_; + case VX_MAT_MUL_SIZE: return mat_size; + CSR_READ_64(VX_CSR_MCYCLE, core_perf.cycles); CSR_READ_64(VX_CSR_MINSTRET, core_perf.instrs); default: @@ -484,6 +497,9 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) { case VX_CSR_MNSTATUS: case VX_CSR_MCAUSE: break; + case VX_MAT_MUL_SIZE: + mat_size = value; + break; default: { std::cout << std::hex << "Error: invalid CSR write addr=0x" << addr << ", value=0x" << value << std::endl; std::abort(); @@ -500,4 +516,4 @@ void Emulator::update_fcrs(uint32_t fflags, uint32_t tid, uint32_t wid) { this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, tid, wid) | fflags, tid, wid); this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, tid, wid) | fflags, tid, wid); } -} \ No newline at end of file +} diff --git a/sim/simx/emulator.h b/sim/simx/emulator.h index 81dcecd83..82b5bc98b 100644 --- a/sim/simx/emulator.h +++ b/sim/simx/emulator.h @@ -53,6 +53,8 @@ class Emulator { bool wspawn(uint32_t num_warps, Word nextPC); int get_exitcode() const; + + Word get_tiles(); private: @@ -121,6 +123,8 @@ class Emulator { MemoryUnit mmu_; Word csr_mscratch_; wspawn_t wspawn_; + std::vector scratchpad; + uint32_t mat_size; }; } diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index e0fc2b94a..d522145db 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -25,6 +25,7 @@ #include "emulator.h" #include "instr.h" #include "core.h" +#include "VX_types.h" using namespace vortex; @@ -1414,6 +1415,184 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { std::abort(); } } break; + case Opcode::TCU: + { //TODO - make it data-type flexible + uint32_t mem_bytes = 1; + DP(3, "mem_bytes=" << mem_bytes << std::endl); + uint16_t tc_size = core_->arch().tc_size(); + uint32_t TC_per_warp = core_->arch().tc_num(); + + //Number of loads - dependant on the thread config + uint32_t n_tiles = this->get_csr(VX_MAT_MUL_SIZE, 0, wid); //CSR instruction before MLOAD will ensure that this csr has value + int num_data_per_thread; + int num_data_per_thread_st; + int num_threads_actv; + int num_threads_actv_st; + uint32_t data_bytes_load; + uint32_t data_bytes_store; + uint32_t num_threads_per_tc = MAX (1, num_threads/TC_per_warp); + + //int num_warps = MIN() + //int active_tcs = MIN (TC_per_warp, num_output_tiles/num_warps) + //LOAD + if(num_threads > tc_size*tc_size*n_tiles*TC_per_warp) + { + num_threads_actv = tc_size*tc_size*n_tiles*TC_per_warp; + num_data_per_thread = 1; + } + else + { + num_threads_actv = num_threads; + num_data_per_thread = (tc_size*tc_size*n_tiles)/num_threads_per_tc; + } + data_bytes_load = mem_bytes*num_data_per_thread; + + //STORE + + // DP(3, "DEBUG :: num_threads = " << num_threads); + // DP(3, "DEBUG :: tc_size*tc_size = " << tc_size*tc_size); + //DP(3, "imm = " << immsrc); + + if(num_threads > tc_size*tc_size*TC_per_warp) + { + num_threads_actv_st = tc_size*tc_size*TC_per_warp; + num_data_per_thread_st = 1; + } + else + { + num_threads_actv_st = num_threads; + num_data_per_thread_st = (tc_size*tc_size)/num_threads_per_tc; + } + data_bytes_store = mem_bytes*num_data_per_thread_st; + + DP(3, "Num Tiles=" << n_tiles << std::endl); + + switch (func3) { + case 0: + { //Matrix Load + + DP (4, "TCU LOAD"); + trace->fu_type = FUType::LSU; + trace->lsu_type = LsuType::TCU_LOAD; + + trace->used_iregs.set(rsrc0); + auto trace_data = std::make_shared(num_threads); + trace->data = trace_data; + + for (uint32_t t = thread_start; t < num_threads_actv; ++t) + { + if (!warp.tmask.test(t)) + continue; + DP(3, "Thread ID" << t); + + uint32_t base_addr = rsdata[t][0].i ; + trace_data->mem_addrs.at(t) = {base_addr, data_bytes_load}; + + //Load A or B (depends on immsrc) + int loop_offset = 0; + DP(3, "n_tiles = " << n_tiles << "; num_data_per_thread = " << num_data_per_thread <dcache_read(temp_ref, (base_addr+(n*mem_bytes)+(loop_offset*mem_bytes)), mem_bytes); + + scratchpad[loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n] = *temp_ref; + DP(3, "Scratchpad Index: " << loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n << ", Value: " << scratchpad[loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n]); + } + //loop_offset += tc_size*tc_size; + //} + } + rd_write = true; + } break; + case 1: + { + DP(4, "TCU STORE"); + trace->fu_type = FUType::LSU; + trace->lsu_type = LsuType::TCU_STORE; + + auto trace_data = std::make_shared(num_threads); + trace->data = trace_data; + uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2; + + for (uint32_t t = thread_start; t < num_threads_actv_st; ++t) + { + if (!warp.tmask.test(t)) + continue; + + DP(3, "Thread ID" << t); + uint32_t base_addr = rsdata[t][0].i ; + + trace_data->mem_addrs.at(t) = {base_addr, data_bytes_store}; + + //Store C + for (int n=0; n csr (TODO :: can intermediate step of moving to CSR be skipped?) + //core_->set_csr(csr_addr[(2*num_data_per_thread) + n], scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread) + n], t, warp_id_); + Word* temp_ref = &(warp.ireg_file.at(t).at(rsrc0)); + *temp_ref = scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread_st) + n]; + + this->dcache_write(temp_ref, base_addr+(n*mem_bytes), mem_bytes); + } + } + //Clear the scratchpad + for(int i =0 ; i < scratchpad.size(); i++) + { + scratchpad[i] = 0; + } + } + break; + case 2: + { //Matrix Multiply + DP(4, "TCU MULTIPLY MAT"); + trace->fu_type = FUType::TCU; + trace->tcu_type = TCUType::TCU_MUL; + uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2; + uint32_t threads_per_tc = MAX (1, num_threads/TC_per_warp); + for (uint32_t t = thread_start; t < num_threads_actv; ++t) + { + if (!warp.tmask.test(t)) + continue; + + DP(3, "Thread ID" << t); + //TC operation [only 1 thread in 1 warp needs to do this] + if (t%threads_per_tc == 0) + { + //TODO - change to systolic array implementation + uint32_t thread_offset = t*(tc_size*tc_size); + int loop_offset = 0; + int offset_b = n_tiles*n_tiles*n_tiles*tc_size*tc_size; + // Loop over all tiles - output stationary + //for(int tiles = 0 ; tiles < n_tiles ; tiles++) //What's the HW implication of this?? A counter implementation? + //{ + /* + for (int i = 0; i < tc_size; i++) { //ROW-1 + for (int j = 0; j < tc_size; j++) { //COL-2 + int sum = 0; + for (int k = 0; k < tc_size; k++) + { //COL-1 + sum = sum + scratchpad[loop_offset + thread_offset*n_tiles + i * tc_size + k] *scratchpad[loop_offset + thread_offset*n_tiles + offset_b + (k * tc_size + j)]; + } + scratchpad[accu_offset + thread_offset +(i * tc_size + j)] += sum; //[i * col2 + j] = sum + DP(3, "Scratchpad Index: " << accu_offset + (i * tc_size + j) << " , Value=" << scratchpad[accu_offset + (i * tc_size + j)]); + + } + } + */ + //loop_offset += tc_size*tc_size; //Move to the next tiled matmul fragment + //} + } + } + + }break; + default: + std::abort(); + } + } break; default: std::abort(); } diff --git a/sim/simx/func_unit.cpp b/sim/simx/func_unit.cpp index c9a3f0fc7..3991a17e7 100644 --- a/sim/simx/func_unit.cpp +++ b/sim/simx/func_unit.cpp @@ -21,6 +21,7 @@ #include "core.h" #include "constants.h" #include "cache_sim.h" +#include "VX_types.h" using namespace vortex; @@ -162,7 +163,7 @@ void LsuUnit::tick() { continue; } - bool is_write = (trace->lsu_type == LsuType::STORE); + bool is_write = ((trace->lsu_type == LsuType::STORE) || (trace->lsu_type == LsuType::TCU_STORE)); // check pending queue capacity if (!is_write && state.pending_rd_reqs.full()) { @@ -175,13 +176,14 @@ void LsuUnit::tick() { } uint32_t tag = 0; + if (!is_write) { tag = state.pending_rd_reqs.allocate({trace, 0}); } // send memory request auto num_reqs = this->send_requests(trace, block_idx, tag); - + if (!is_write) { state.pending_rd_reqs.at(tag).count = num_reqs; } @@ -200,7 +202,14 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) { int count = 0; auto trace_data = std::dynamic_pointer_cast(trace->data); - bool is_write = (trace->lsu_type == LsuType::STORE); + bool is_write = ((trace->lsu_type == LsuType::STORE) || (trace->lsu_type == LsuType::TCU_STORE)); + + uint16_t req_per_thread = 1; + if ((trace->lsu_type == LsuType::TCU_LOAD) || (trace->lsu_type == LsuType::TCU_STORE)) + { + req_per_thread= (1>(trace_data->mem_addrs.at(0).size)/4)? 1: ((trace_data->mem_addrs.at(0).size)/4); + } + auto t0 = trace->pid * NUM_LSU_LANES; for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) { @@ -213,33 +222,69 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) { auto mem_addr = trace_data->mem_addrs.at(t); auto type = get_addr_type(mem_addr.addr); - - MemReq mem_req; - mem_req.addr = mem_addr.addr; - mem_req.write = is_write; - mem_req.type = type; - mem_req.tag = tag; - mem_req.cid = trace->cid; - mem_req.uuid = trace->uuid; - - dcache_req_port.push(mem_req, 1); - DT(3, "mem-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag - << ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace); - - if (is_write) { - ++core_->perf_stats_.stores; - } else { - ++core_->perf_stats_.loads; - ++pending_loads_; + // DT(3, "addr_type = " << type << ", " << *trace); + uint32_t mem_bytes = 1; + for (int i = 0; i < req_per_thread; i++) + { + MemReq mem_req; + mem_req.addr = mem_addr.addr + (i*mem_bytes); + mem_req.write = is_write; + mem_req.type = type; + mem_req.tag = tag; + mem_req.cid = trace->cid; + mem_req.uuid = trace->uuid; + + dcache_req_port.push(mem_req, 1); + DT(3, "mem-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag + << ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace); + + if (is_write) { + ++core_->perf_stats_.stores; + } else { + ++core_->perf_stats_.loads; + ++pending_loads_; + } + + ++count; } - - ++count; } return count; } /////////////////////////////////////////////////////////////////////////////// +TcuUnit::TcuUnit(const SimContext& ctx, Core* core) + : FuncUnit(ctx, core, "TCU") + , tc_size (core_->arch().tc_size()) + {} + +void TcuUnit::tick() { + + for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { + auto& input = Inputs.at(i); + if (input.empty()) + continue; + auto& output = Outputs.at(i); + auto trace = input.front(); + uint32_t n_tiles = core_->emulator_.get_tiles(); + switch (trace->tcu_type) { + case TCUType::TCU_MUL: + { //mat size = n_tiles * tc_size + int matmul_latency = (n_tiles * tc_size) + tc_size + tc_size; + output.push(trace, matmul_latency); + DT(3, "matmul_latency = " << matmul_latency << ", " << *trace); + break; + } + default: + std::abort(); + } + DT(3, "pipeline-execute: op=" << trace->tcu_type << ", " << *trace); + input.pop(); + } +} + +/////////////////////////////////////////////////////////////////////////////// + SfuUnit::SfuUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "SFU") {} diff --git a/sim/simx/func_unit.h b/sim/simx/func_unit.h index 45f0152ff..5fc922991 100644 --- a/sim/simx/func_unit.h +++ b/sim/simx/func_unit.h @@ -100,6 +100,15 @@ class LsuUnit : public FuncUnit { /////////////////////////////////////////////////////////////////////////////// +class TcuUnit : public FuncUnit { +public: + TcuUnit(const SimContext& ctx, Core*); + uint64_t tc_size; + void tick(); +}; + +/////////////////////////////////////////////////////////////////////////////// + class SfuUnit : public FuncUnit { public: SfuUnit(const SimContext& ctx, Core*); diff --git a/sim/simx/instr.h b/sim/simx/instr.h index f97a19eac..061b4deb0 100644 --- a/sim/simx/instr.h +++ b/sim/simx/instr.h @@ -46,7 +46,7 @@ enum class Opcode { EXT1 = 0x0b, EXT2 = 0x2b, EXT3 = 0x5b, - EXT4 = 0x7b + TCU = 0x7b }; enum class InstType { diff --git a/sim/simx/instr_trace.h b/sim/simx/instr_trace.h index 532b736f5..9d6859fb7 100644 --- a/sim/simx/instr_trace.h +++ b/sim/simx/instr_trace.h @@ -75,6 +75,7 @@ struct instr_trace_t { AluType alu_type; FpuType fpu_type; SfuType sfu_type; + TCUType tcu_type; }; ITraceData::Ptr data; diff --git a/sim/simx/main.cpp b/sim/simx/main.cpp index 0f61de6f4..58eb96d61 100644 --- a/sim/simx/main.cpp +++ b/sim/simx/main.cpp @@ -35,6 +35,8 @@ static void show_usage() { uint32_t num_threads = NUM_THREADS; uint32_t num_warps = NUM_WARPS; uint32_t num_cores = NUM_CORES; +uint32_t tc_size = TC_SIZE; +uint32_t tc_num = TC_NUM; bool showStats = false; const char* program = nullptr; @@ -81,7 +83,7 @@ int main(int argc, char **argv) { { // create processor configuation - Arch arch(num_threads, num_warps, num_cores); + Arch arch(num_threads, num_warps, num_cores, tc_size, tc_num); // create memory module RAM ram(0, RAM_PAGE_SIZE); diff --git a/sim/simx/types.h b/sim/simx/types.h index a84216ae1..15623ce39 100644 --- a/sim/simx/types.h +++ b/sim/simx/types.h @@ -23,6 +23,7 @@ #include #include #include "debug.h" +#include namespace vortex { @@ -78,6 +79,7 @@ enum class FUType { LSU, FPU, SFU, + TCU, Count }; @@ -87,6 +89,7 @@ inline std::ostream &operator<<(std::ostream &os, const FUType& type) { case FUType::LSU: os << "LSU"; break; case FUType::FPU: os << "FPU"; break; case FUType::SFU: os << "SFU"; break; + case FUType::TCU: os << "TCU"; break; default: assert(false); } return os; @@ -118,14 +121,30 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) { enum class LsuType { LOAD, + TCU_LOAD, STORE, + TCU_STORE, FENCE }; +enum class TCUType { + TCU_MUL +}; + +inline std::ostream &operator<<(std::ostream &os, const TCUType& type) { + switch (type) { + case TCUType::TCU_MUL: os << "TCU MUL"; break; + default: assert(false); + } + return os; +} + inline std::ostream &operator<<(std::ostream &os, const LsuType& type) { switch (type) { case LsuType::LOAD: os << "LOAD"; break; + case LsuType::TCU_LOAD: os << "TCU_LOAD"; break; case LsuType::STORE: os << "STORE"; break; + case LsuType::TCU_STORE: os << "TCU_STORE"; break; case LsuType::FENCE: os << "FENCE"; break; default: assert(false); } @@ -383,7 +402,7 @@ class Mux : public SimObject> { , type_(type) , delay_(delay) , cursors_(num_outputs, 0) - , num_reqs_(num_inputs / num_outputs) + , num_reqs_(log2ceil(num_inputs / num_outputs)) { assert(delay != 0); assert(num_inputs <= 32); @@ -407,7 +426,7 @@ class Mux : public SimObject> { void tick() { uint32_t I = Inputs.size(); uint32_t O = Outputs.size(); - uint32_t R = num_reqs_; + uint32_t R = 1 << num_reqs_; // skip bypass mode if (I == O) diff --git a/tests/regression/matmul/Makefile b/tests/regression/matmul/Makefile new file mode 100644 index 000000000..7f1c48523 --- /dev/null +++ b/tests/regression/matmul/Makefile @@ -0,0 +1,14 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := matmul + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= -n128 -d1 + +include ../common.mk diff --git a/tests/regression/matmul/common.h b/tests/regression/matmul/common.h new file mode 100644 index 000000000..a9aa5de6c --- /dev/null +++ b/tests/regression/matmul/common.h @@ -0,0 +1,17 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +typedef struct { + uint32_t num_tasks; + uint32_t num_warps; + uint32_t num_threads; + uint32_t TC_per_warp; + uint32_t matrix_size; + uint32_t data_size; + uint64_t tc_size; + uint64_t src0_addr; + uint64_t src1_addr; + uint64_t dst_addr; +} kernel_arg_t; + +#endif \ No newline at end of file diff --git a/tests/regression/matmul/kernel.cpp b/tests/regression/matmul/kernel.cpp new file mode 100644 index 000000000..eeb902acb --- /dev/null +++ b/tests/regression/matmul/kernel.cpp @@ -0,0 +1,124 @@ +#include +#include +#include +#include "common.h" + +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + uint32_t task_id = blockIdx.x; + int32_t* src0_ptr = (int32_t*)arg->src0_addr; + int32_t* src1_ptr = (int32_t*)arg->src1_addr; + int32_t* dst_ptr = (int32_t*)arg->dst_addr; + unsigned a_addr = reinterpret_cast(src0_ptr); + unsigned b_addr = reinterpret_cast(src1_ptr); + unsigned c_addr = reinterpret_cast(dst_ptr); + + uint32_t tc_size = arg->tc_size; + int TC_per_warp = arg->TC_per_warp; + unsigned num_threads = arg->num_threads; + int num_warps = arg->num_warps; + uint32_t matrix_size = arg->matrix_size; + + int n_tiles = matrix_size/tc_size; + int num_output_tiles = (matrix_size*matrix_size)/(tc_size*tc_size); + + int num_tasks = arg->num_tasks; + + //Assuming matrix size always > tensor core size + int warps_actual; + if (TC_per_warp > num_output_tiles) + warps_actual = 1; + else + warps_actual = num_output_tiles/TC_per_warp; + + int num_warps_actual = (warps_actual < num_warps)? warps_actual: num_warps; + int num_threads_per_tc = (1> num_threads/TC_per_warp)? 1: num_threads/TC_per_warp; + + int num_tasks_per_thread = (1> (num_tasks/(num_threads*num_warps_actual)))? 1: (num_tasks/(num_threads*num_warps_actual)); + int num_tasks_per_warp = (1 > num_tasks/num_warps_actual)? 1:num_tasks/num_warps_actual; + int task_id_first_warp = task_id%num_tasks_per_warp; + + //A&B + int num_data_per_op_tile = tc_size*tc_size*n_tiles; + int num_data_per_warp = num_data_per_op_tile*((1> (num_output_tiles/num_warps_actual))?1:(num_output_tiles/num_warps_actual)); + + int addr_shift; + if (((tc_size*tc_size*n_tiles)/(num_threads)) > 1) + addr_shift = (tc_size*tc_size*n_tiles)/(num_threads); + else + addr_shift = 1; + //Offset for 1st warp + int offset = ((task_id_first_warp/num_tasks_per_thread)*addr_shift) + ((task_id_first_warp%num_tasks_per_thread)*num_data_per_op_tile); + offset = offset + (num_data_per_warp*(task_id/num_tasks_per_warp)); + + //C + int num_data_per_op_tile_c = tc_size*tc_size; + int num_data_per_warp_c = num_data_per_warp/n_tiles; + + int addr_shift_c; + if (((tc_size*tc_size)/(num_threads)) > 1) + addr_shift_c = tc_size; + else + addr_shift_c = 1; + //Offset for 1st warp + int offset_c = ((task_id_first_warp/num_tasks_per_thread)*addr_shift_c) + ((task_id_first_warp%num_tasks_per_thread)*num_data_per_op_tile_c); + offset_c = offset_c + (num_data_per_warp_c*(task_id/num_tasks_per_warp)); + + int thread_limit = (num_threads < tc_size*tc_size*n_tiles*TC_per_warp)? num_threads : tc_size*tc_size*n_tiles*TC_per_warp; + int thread_limit_c = (num_threads 64 tasks => 32 tasks/warp => 8 tasks/thread + /*task0->thread0, warp0 + task1->thread0 , warp0 + task2->thread0 , warp0 + . + task7->thread0 + task8->thread1 + task9->thread1 + . + . + ------ + task32 -> thread0, warp1 + task33 -> thread1, warp1 + . + */ + + //NEW TASK DISTRIBUTION // For 8x8 matrix, 2x2 tc_size, 1 tc_num, 4threads, 2warps => 64 tasks => 32 tasks/warp => 8 tasks/thread + /*task0->thread0, warp0 + task1->thread1 , warp0 + task2->thread2 , warp0 + task3->thread3 ,... + task4->thread0 + task5->thread1 + . + . + ------ + task32 -> thread0, warp1 + task33 -> thread1, warp1 + . + .*/ + + //TODO :: change this for new task->thread distribution + if (((task_id%num_tasks_per_warp)/num_tasks_per_thread) < thread_limit) + { + unsigned a_addr_base = a_addr + offset*arg->data_size; + unsigned b_addr_base = b_addr + offset*arg->data_size; + unsigned c_addr_base = c_addr + offset_c*arg->data_size; + csr_write(VX_MAT_MUL_SIZE,n_tiles); + mload (0, a_addr_base); + mload (1, b_addr_base); + //In case of multiple threads - sync load + vx_fence(); + + mm(); //Assuming padding to ensure matrix size is a multiple of tc_size + vx_fence(); + if (((task_id%num_tasks_per_warp)/num_tasks_per_thread) < thread_limit_c) + ms(c_addr_base); + //In case of multiple threads - sync store + vx_fence(); + } +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + return vx_spawn_threads(1, &arg->num_tasks, nullptr, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/matmul/main.cpp b/tests/regression/matmul/main.cpp new file mode 100644 index 000000000..6a86712ae --- /dev/null +++ b/tests/regression/matmul/main.cpp @@ -0,0 +1,343 @@ +#include +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.vxbin"; +uint32_t matrix_size = 0; +vx_device_h device = nullptr; +vx_buffer_h A_buffer = nullptr; +vx_buffer_h B_buffer = nullptr; +vx_buffer_h C_buffer = nullptr; +vx_buffer_h krnl_buffer = nullptr; +vx_buffer_h args_buffer = nullptr; + +std::vector staging_buf; +kernel_arg_t kernel_arg = {}; + +static void show_usage() { + std::cout << "Vortex Test." << std::endl; + std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv, uint32_t &data_size) { + int c; + while ((c = getopt(argc, argv, "n:k:d:h?")) != -1) { + switch (c) { + case 'n': + matrix_size = atoi(optarg); + break; + case 'k': + kernel_file = optarg; + break; + case 'd': + data_size = atoi(optarg); + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + vx_mem_free(A_buffer); + vx_mem_free(B_buffer); + vx_mem_free(C_buffer); + vx_mem_free(krnl_buffer); + vx_mem_free(args_buffer); + vx_dev_close(device); + } +} + +template +class mainVariables +{ + public: + // Constructor + mainVariables(uint32_t bufSize, uint32_t dataSize, uint32_t matrixSize) + : buf_size(bufSize), data_size(dataSize), matrix_size(matrixSize) + { + // Resize vectors to specified sizes + src_A.resize(buf_size/data_size); + src_B.resize(buf_size/data_size); + refs.resize(buf_size/data_size); + } + + void init_inputs () + { + std::cout << "inside init" << std::endl; + for (uint32_t i = 0; i < matrix_size*matrix_size; ++i) + { + auto a = static_cast(std::rand()) / RAND_MAX; + auto b = static_cast(std::rand()) / RAND_MAX; + src_A[i] = static_cast(a * matrix_size); + src_B[i] = static_cast(b * matrix_size); + } + } + + void matmul_cpu() + { + for (uint32_t row = 0; row < matrix_size; ++row) + { + for (uint32_t col = 0; col < matrix_size; ++col) + { + TYPE sum(0); + for (uint32_t e = 0; e < matrix_size; ++e) { + sum += src_A[row * matrix_size + e] * src_B[e * matrix_size + col]; + } + refs[row * matrix_size + col] = sum; + } + } + } + + //Public variables + std::vector src_A; + std::vector src_B; + std::vector refs; + + std::vector A_mat; + std::vector B_mat; + + private: + uint32_t buf_size; + uint32_t data_size; + uint32_t matrix_size; +}; + + + +int main(int argc, char *argv[]) { + // parse command arguments + uint32_t data_size = 0; + parse_args(argc, argv, data_size); + if (matrix_size == 0) { + matrix_size = 2; + } + + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + uint64_t num_cores, num_warps, num_threads, tc_size, TC_per_warp; + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp)); + + std::cout << "Debug :: tc_size = " << tc_size << std::endl; + std::cout << "Debug :: tc_num = " << TC_per_warp << std::endl; + + int threads_per_tc; + //TODO - can be changed + //Number of output tiles * number of threads + if (TC_per_warp > num_threads) + threads_per_tc = 1; + else + threads_per_tc = num_threads/TC_per_warp; + + uint32_t num_tasks = ((matrix_size*matrix_size)/(tc_size*tc_size))*threads_per_tc; + + //size of each operand + uint32_t buf_size = ((matrix_size*matrix_size)/(tc_size*tc_size))*(matrix_size/(tc_size))*(tc_size*tc_size)*data_size; + + //256 + std::cout << "Debug :: buf_size: " << buf_size << " bytes" << std::endl; + + // allocate device memory + std::cout << "allocate device memory" << std::endl; + + RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &A_buffer)); + RT_CHECK(vx_mem_address(A_buffer, &kernel_arg.src0_addr)); + RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &B_buffer)); + RT_CHECK(vx_mem_address(B_buffer, &kernel_arg.src1_addr)); + RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &C_buffer)); + RT_CHECK(vx_mem_address(C_buffer, &kernel_arg.dst_addr)); + + std::cout << "A_addr=0x" << std::hex << kernel_arg.src0_addr << std::endl; + std::cout << "B_addr=0x" << std::hex << kernel_arg.src1_addr << std::endl; + std::cout << "C_addr=0x" << std::hex << kernel_arg.dst_addr << std::endl; + + mainVariables variables (buf_size, data_size, matrix_size); + variables.init_inputs(); + + ////////////////////////////////////////////////// + // generate source data + ////////////////////////////////////////////////// + variables.matmul_cpu(); + + uint32_t tc_size_f = tc_size*tc_size; + uint32_t n_tiles = matrix_size/tc_size; + + variables.A_mat.resize(buf_size); + variables.B_mat.resize(buf_size); + + //Demand matrix creation for A / traverse through the rows + for(uint32_t k=0; k(time_end - time_start).count(); + printf("Elapsed time: %lg ms\n", elapsed); + + // download destination buffer + std::cout << "download destination buffer" << std::endl; + RT_CHECK(vx_copy_from_dev((int8_t*)variables.B_mat.data(), C_buffer, 0, buf_size)); + + // verify result (TODO : needs to be fixed for for functional correctness) + /* + std::cout << "verify result" << std::endl; + { + int errors = 0; + auto buf_ptr = (int8_t*)staging_buf.data(); + uint64_t tc_size = kernel_arg.tc_size; + std::cout << "tc_size = " << tc_size << std::endl; + int Result[matrix_size*matrix_size]; + int n_tiles = (matrix_size/tc_size); + int tc_size_f = tc_size*tc_size; + + //converting buf ptr (tile by tile) to CPU style linear (row by row) + for(int k = 0; k < matrix_size/tc_size; k+= 1) + { + for(int j = 0; j < matrix_size; j+= tc_size) + { + for(int i =0; i < tc_size*tc_size; i++) + { + Result[ tc_size*matrix_size*k +j+ (i/tc_size)*matrix_size +i%(tc_size)] = buf_ptr[matrix_size*tc_size*k+tc_size*j+i]; + } + } + } + + for (uint32_t i = 0; i < matrix_size*matrix_size; ++i) { + //int ref = i + i; + int cur = Result[i]; + if (cur != refs[i]) { + ++errors; + } + } + if (errors != 0) { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + else + { + std::cout << "CONDITIONALLY PASSED!" << std::endl; + } + } + */ + + // cleanup + std::cout << "cleanup" << std::endl; + cleanup(); + + std::cout << "PASSED!" << std::endl; + + return 0; +} \ No newline at end of file From 0e3badf723c9e2d03013237433aa0838b21c7f99 Mon Sep 17 00:00:00 2001 From: Varsha Singhania Date: Tue, 18 Jun 2024 02:19:57 -0400 Subject: [PATCH 2/7] Script checkin and code cleanup --- run_final.sh | 22 ++++++++++++++++++++++ sim/simx/execute.cpp | 28 +++++++++------------------- 2 files changed, 31 insertions(+), 19 deletions(-) create mode 100755 run_final.sh diff --git a/run_final.sh b/run_final.sh new file mode 100755 index 000000000..5f618dc64 --- /dev/null +++ b/run_final.sh @@ -0,0 +1,22 @@ +# Define arrays for threads, warps, and matrix sizes +matrix_sizes=(16 32 64 128 256 512) +tcsizes=(8 16 32) +tcnums=(4 8 16 32) +#lsulanes=(4 16) +#cores=(32) + + +# Loop through each combination of threads and warps +for size in "${matrix_sizes[@]}"; do + sed -i "s/OPTS ?= -n[0-9]\+/OPTS ?= -n${size}/" ../tests/regression/matmul/Makefile + sed -i "s/OPTS ?= -n[0-9]\+/OPTS ?= -n${size}/" tests/regression/matmul/Makefile + echo "Matrix size changed to ${size} in Makefile" + for tcsize in "${tcsizes[@]}"; do + for tcnum in "${tcnums[@]}"; do + log_name="sim_final/mat${size}/tcsize${tcsize}_tcnum${tcnum}_32w32t" + command="./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --tc_size=${tcsize} --tc_num=${tcnum} --rebuild=1 --perf=1 > ${log_name} 2>&1" + echo "$command" + eval "$command" + done + done +done diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index d522145db..e13df18b9 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -1432,8 +1432,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { uint32_t data_bytes_store; uint32_t num_threads_per_tc = MAX (1, num_threads/TC_per_warp); - //int num_warps = MIN() - //int active_tcs = MIN (TC_per_warp, num_output_tiles/num_warps) //LOAD if(num_threads > tc_size*tc_size*n_tiles*TC_per_warp) { @@ -1448,11 +1446,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { data_bytes_load = mem_bytes*num_data_per_thread; //STORE - - // DP(3, "DEBUG :: num_threads = " << num_threads); - // DP(3, "DEBUG :: tc_size*tc_size = " << tc_size*tc_size); - //DP(3, "imm = " << immsrc); - if(num_threads > tc_size*tc_size*TC_per_warp) { num_threads_actv_st = tc_size*tc_size*TC_per_warp; @@ -1499,8 +1492,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { scratchpad[loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n] = *temp_ref; DP(3, "Scratchpad Index: " << loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n << ", Value: " << scratchpad[loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n]); } - //loop_offset += tc_size*tc_size; - //} } rd_write = true; } break; @@ -1531,7 +1522,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { uint32_t csr_index = (2*num_data_per_thread_st) + n; uint32_t scratchpad_index = (tc_size*tc_size*2) + (t*num_data_per_thread) + n; - //scratchpad -> csr (TODO :: can intermediate step of moving to CSR be skipped?) + //scratchpad -> csr (TODO :: removed intermediate CSR stage ; incorporate limited scratchmad implementation) //core_->set_csr(csr_addr[(2*num_data_per_thread) + n], scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread) + n], t, warp_id_); Word* temp_ref = &(warp.ireg_file.at(t).at(rsrc0)); *temp_ref = scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread_st) + n]; @@ -1562,14 +1553,14 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { //TC operation [only 1 thread in 1 warp needs to do this] if (t%threads_per_tc == 0) { - //TODO - change to systolic array implementation + //TODO : change to systolic array implementation uint32_t thread_offset = t*(tc_size*tc_size); int loop_offset = 0; int offset_b = n_tiles*n_tiles*n_tiles*tc_size*tc_size; - // Loop over all tiles - output stationary - //for(int tiles = 0 ; tiles < n_tiles ; tiles++) //What's the HW implication of this?? A counter implementation? - //{ - /* + /* + // TODO : Fix needed for functional correctness + for(int tiles = 0 ; tiles < n_tiles ; tiles++) //What's the HW implication of this?? A counter implementation? + { for (int i = 0; i < tc_size; i++) { //ROW-1 for (int j = 0; j < tc_size; j++) { //COL-2 int sum = 0; @@ -1579,12 +1570,11 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { } scratchpad[accu_offset + thread_offset +(i * tc_size + j)] += sum; //[i * col2 + j] = sum DP(3, "Scratchpad Index: " << accu_offset + (i * tc_size + j) << " , Value=" << scratchpad[accu_offset + (i * tc_size + j)]); - } } - */ - //loop_offset += tc_size*tc_size; //Move to the next tiled matmul fragment - //} + loop_offset += tc_size*tc_size; //Move to the next tiled matmul fragment + } + */ } } From a378aed67cc7d891201124eb5b51059ec3989252 Mon Sep 17 00:00:00 2001 From: Nayan Sivakumar Nair Date: Fri, 21 Jun 2024 22:23:24 -0400 Subject: [PATCH 3/7] Moved tc_num, tc_size param to makefile args --- ci/blackbox.sh | 10 +--------- ci/regression.sh.in | 1 + hw/rtl/VX_types.vh | 3 +++ runtime/simx/vortex.cpp | 14 +++++++------- sim/simx/arch.h | 14 +------------- sim/simx/emulator.cpp | 16 +++++++++++++++- sim/simx/emulator.h | 3 +++ sim/simx/execute.cpp | 7 +++++-- sim/simx/func_unit.cpp | 4 +++- sim/simx/func_unit.h | 2 +- sim/simx/main.cpp | 2 +- tests/regression/matmul/Makefile | 2 +- tests/regression/matmul/kernel.cpp | 5 ++++- tests/regression/matmul/main.cpp | 29 ++++++++++++++++++++++++----- 14 files changed, 70 insertions(+), 42 deletions(-) diff --git a/ci/blackbox.sh b/ci/blackbox.sh index 8a04133f9..defad4c05 100755 --- a/ci/blackbox.sh +++ b/ci/blackbox.sh @@ -114,14 +114,6 @@ case $i in LOGFILE=${i#*=} shift ;; - --tc_size=*) - TC_SIZE=${i#*=} - shift - ;; - --tc_num=*) - TC_NUM=${i#*=} - shift - ;; --help) show_help exit 0 @@ -190,7 +182,7 @@ then fi CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS" -CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DTC_NUM=$TC_NUM -DTC_SIZE=$TC_SIZE $L2 $L3 $PERF_FLAG $CONFIGS" +# CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DTC_NUM=$TC_NUM -DTC_SIZE=$TC_SIZE $L2 $L3 $PERF_FLAG $CONFIGS" echo "CONFIGS=$CONFIGS" if [ $REBUILD -ne 0 ] diff --git a/ci/regression.sh.in b/ci/regression.sh.in index a5f1bffdb..50d309af6 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -124,6 +124,7 @@ regression() # test local barrier ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tbar" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -tbar" + echo "regression tests done!" } diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index 06929b058..9a8f93234 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -197,6 +197,9 @@ `define VX_CSR_LOCAL_MEM_BASE 12'hFC3 `define VX_MAT_MUL_SIZE 12'hFC4 +`define VX_TC_NUM 12'hFC5 +`define VX_TC_SIZE 12'hFC6 + `endif // VX_TYPES_VH diff --git a/runtime/simx/vortex.cpp b/runtime/simx/vortex.cpp index f65d7b385..4210ab0b6 100644 --- a/runtime/simx/vortex.cpp +++ b/runtime/simx/vortex.cpp @@ -32,7 +32,7 @@ using namespace vortex; class vx_device { public: vx_device() - : arch_(NUM_THREADS, NUM_WARPS, NUM_CORES, TC_SIZE, TC_NUM) + : arch_(NUM_THREADS, NUM_WARPS, NUM_CORES) , ram_(0, RAM_PAGE_SIZE) , processor_(arch_) , global_mem_(ALLOC_BASE_ADDR, @@ -69,12 +69,12 @@ class vx_device { case VX_CAPS_NUM_CORES: _value = NUM_CORES * NUM_CLUSTERS; break; - case VX_CAPS_TC_SIZE: - _value = TC_SIZE; - break; - case VX_CAPS_TC_NUM: - _value = TC_NUM; - break; + // case VX_CAPS_TC_SIZE: + // _value = TC_SIZE; + // break; + // case VX_CAPS_TC_NUM: + // _value = TC_NUM; + // break; case VX_CAPS_CACHE_LINE_SIZE: _value = CACHE_BLOCK_SIZE; break; diff --git a/sim/simx/arch.h b/sim/simx/arch.h index e35687dbd..9af266d7a 100644 --- a/sim/simx/arch.h +++ b/sim/simx/arch.h @@ -35,11 +35,9 @@ class Arch { uint16_t num_barriers_; uint16_t ipdom_size_; uint64_t local_mem_base_; - uint16_t tc_size_; - uint16_t tc_num_; public: - Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint64_t tc_size, uint64_t tc_num) + Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores) : num_threads_(num_threads) , num_warps_(num_warps) , num_cores_(num_cores) @@ -51,8 +49,6 @@ class Arch { , num_barriers_(NUM_BARRIERS) , ipdom_size_((num_threads-1) * 2) , local_mem_base_(LMEM_BASE_ADDR) - , tc_size_ (tc_size) - , tc_num_ (tc_num) {} uint16_t vsize() const { @@ -98,14 +94,6 @@ class Arch { uint16_t socket_size() const { return socket_size_; } - - uint16_t tc_size() const { - return tc_size_; - } - - uint16_t tc_num() const { - return tc_num_; - } }; diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp index ea5f72c42..d2faf7f98 100644 --- a/sim/simx/emulator.cpp +++ b/sim/simx/emulator.cpp @@ -74,7 +74,7 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core) , core_(core) , warps_(arch.num_warps(), arch) , barriers_(arch.num_barriers(), 0) - , scratchpad(std::vector(core->arch().tc_size() * core->arch().tc_size() * 32768)) //Fix this + , scratchpad(std::vector(32 * 32 * 32768)) //Fix this : Max TC_SIZE = 32 { this->clear(); } @@ -355,6 +355,11 @@ Word Emulator::get_tiles() return mat_size; } +Word Emulator::get_tc_size() +{ + return tc_size; +} + Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { auto core_perf = core_->perf_stats(); switch (addr) { @@ -387,6 +392,8 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_LOCAL_MEM_BASE: return arch_.local_mem_base(); case VX_CSR_MSCRATCH: return csr_mscratch_; case VX_MAT_MUL_SIZE: return mat_size; + case VX_TC_NUM: return tc_num; + case VX_TC_SIZE: return tc_size; CSR_READ_64(VX_CSR_MCYCLE, core_perf.cycles); CSR_READ_64(VX_CSR_MINSTRET, core_perf.instrs); @@ -500,6 +507,13 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) { case VX_MAT_MUL_SIZE: mat_size = value; break; + case VX_TC_NUM: + tc_num = value; + break; + case VX_TC_SIZE: + tc_size = value; + break; + default: { std::cout << std::hex << "Error: invalid CSR write addr=0x" << addr << ", value=0x" << value << std::endl; std::abort(); diff --git a/sim/simx/emulator.h b/sim/simx/emulator.h index 82b5bc98b..743c2786e 100644 --- a/sim/simx/emulator.h +++ b/sim/simx/emulator.h @@ -55,6 +55,7 @@ class Emulator { int get_exitcode() const; Word get_tiles(); + Word get_tc_size(); private: @@ -125,6 +126,8 @@ class Emulator { wspawn_t wspawn_; std::vector scratchpad; uint32_t mat_size; + uint32_t tc_size; + uint32_t tc_num; }; } diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index e13df18b9..0dfd72a0f 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -1419,8 +1419,11 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { { //TODO - make it data-type flexible uint32_t mem_bytes = 1; DP(3, "mem_bytes=" << mem_bytes << std::endl); - uint16_t tc_size = core_->arch().tc_size(); - uint32_t TC_per_warp = core_->arch().tc_num(); + uint16_t tc_size = this->get_csr(VX_TC_SIZE, 0, wid); + uint32_t TC_per_warp = this->get_csr(VX_TC_NUM, 0, wid); + + DP(3, "tc_size=" << tc_size << std::endl); + DP(3, "TC_per_warp=" << TC_per_warp << std::endl); //Number of loads - dependant on the thread config uint32_t n_tiles = this->get_csr(VX_MAT_MUL_SIZE, 0, wid); //CSR instruction before MLOAD will ensure that this csr has value diff --git a/sim/simx/func_unit.cpp b/sim/simx/func_unit.cpp index 3991a17e7..f53a1fb22 100644 --- a/sim/simx/func_unit.cpp +++ b/sim/simx/func_unit.cpp @@ -255,7 +255,7 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) { TcuUnit::TcuUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "TCU") - , tc_size (core_->arch().tc_size()) + // , tc_size (core_->arch().tc_size()) {} void TcuUnit::tick() { @@ -267,6 +267,8 @@ void TcuUnit::tick() { auto& output = Outputs.at(i); auto trace = input.front(); uint32_t n_tiles = core_->emulator_.get_tiles(); + uint32_t tc_size = core_->emulator_.get_tc_size(); + switch (trace->tcu_type) { case TCUType::TCU_MUL: { //mat size = n_tiles * tc_size diff --git a/sim/simx/func_unit.h b/sim/simx/func_unit.h index 5fc922991..a7f182efe 100644 --- a/sim/simx/func_unit.h +++ b/sim/simx/func_unit.h @@ -103,7 +103,7 @@ class LsuUnit : public FuncUnit { class TcuUnit : public FuncUnit { public: TcuUnit(const SimContext& ctx, Core*); - uint64_t tc_size; + // uint64_t tc_size; void tick(); }; diff --git a/sim/simx/main.cpp b/sim/simx/main.cpp index 58eb96d61..9031a0a02 100644 --- a/sim/simx/main.cpp +++ b/sim/simx/main.cpp @@ -83,7 +83,7 @@ int main(int argc, char **argv) { { // create processor configuation - Arch arch(num_threads, num_warps, num_cores, tc_size, tc_num); + Arch arch(num_threads, num_warps, num_cores); // create memory module RAM ram(0, RAM_PAGE_SIZE); diff --git a/tests/regression/matmul/Makefile b/tests/regression/matmul/Makefile index 7f1c48523..0ef207194 100644 --- a/tests/regression/matmul/Makefile +++ b/tests/regression/matmul/Makefile @@ -9,6 +9,6 @@ SRCS := $(SRC_DIR)/main.cpp VX_SRCS := $(SRC_DIR)/kernel.cpp -OPTS ?= -n128 -d1 +OPTS ?= -n512 -d1 -s4 -t4 include ../common.mk diff --git a/tests/regression/matmul/kernel.cpp b/tests/regression/matmul/kernel.cpp index eeb902acb..a4585fb53 100644 --- a/tests/regression/matmul/kernel.cpp +++ b/tests/regression/matmul/kernel.cpp @@ -13,7 +13,7 @@ void kernel_body(kernel_arg_t* __UNIFORM__ arg) { unsigned c_addr = reinterpret_cast(dst_ptr); uint32_t tc_size = arg->tc_size; - int TC_per_warp = arg->TC_per_warp; + uint32_t TC_per_warp = arg->TC_per_warp; unsigned num_threads = arg->num_threads; int num_warps = arg->num_warps; uint32_t matrix_size = arg->matrix_size; @@ -104,6 +104,9 @@ void kernel_body(kernel_arg_t* __UNIFORM__ arg) { unsigned b_addr_base = b_addr + offset*arg->data_size; unsigned c_addr_base = c_addr + offset_c*arg->data_size; csr_write(VX_MAT_MUL_SIZE,n_tiles); + csr_write(VX_TC_NUM,TC_per_warp); + csr_write(VX_TC_SIZE,tc_size); + mload (0, a_addr_base); mload (1, b_addr_base); //In case of multiple threads - sync load diff --git a/tests/regression/matmul/main.cpp b/tests/regression/matmul/main.cpp index 6a86712ae..b2238bf5a 100644 --- a/tests/regression/matmul/main.cpp +++ b/tests/regression/matmul/main.cpp @@ -21,6 +21,9 @@ const char* kernel_file = "kernel.vxbin"; uint32_t matrix_size = 0; +uint32_t tc_num = 4; +uint32_t TC_size = 8; + vx_device_h device = nullptr; vx_buffer_h A_buffer = nullptr; vx_buffer_h B_buffer = nullptr; @@ -38,7 +41,7 @@ static void show_usage() { static void parse_args(int argc, char **argv, uint32_t &data_size) { int c; - while ((c = getopt(argc, argv, "n:k:d:h?")) != -1) { + while ((c = getopt(argc, argv, "n:k:d:t:s:h?")) != -1) { switch (c) { case 'n': matrix_size = atoi(optarg); @@ -48,7 +51,13 @@ static void parse_args(int argc, char **argv, uint32_t &data_size) { break; case 'd': data_size = atoi(optarg); - break; + break; + case 't': + tc_num = atoi(optarg); + break; + case 's': + TC_size = atoi(optarg); + break; case 'h': case '?': { show_usage(); @@ -141,12 +150,22 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - uint64_t num_cores, num_warps, num_threads, tc_size, TC_per_warp; + uint64_t num_cores, num_warps, num_threads; + uint32_t tc_size, TC_per_warp; + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads)); - RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size)); - RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp)); + + std::cout << "Debug :: tc_size (optarg) = " << TC_size << std::endl; + std::cout << "Debug :: tc_num (optarg) = " << tc_num << std::endl; + + //Add assert/knob + tc_size = TC_size; + TC_per_warp = tc_num; + + // RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size)); + // RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp)); std::cout << "Debug :: tc_size = " << tc_size << std::endl; std::cout << "Debug :: tc_num = " << TC_per_warp << std::endl; From 5b0fc8cbd43c813ced9790b287e8444ab619606b Mon Sep 17 00:00:00 2001 From: Nayan Sivakumar Nair Date: Tue, 25 Jun 2024 03:18:50 -0400 Subject: [PATCH 4/7] Fixes for PR --- ci/blackbox.sh | 3 --- ci/regression.sh.in | 4 ++- hw/rtl/VX_config.vh | 8 +++--- hw/rtl/VX_types.vh | 2 +- kernel/include/vx_intrinsics.h | 12 ++++----- run_final.sh | 22 ----------------- runtime/simx/vortex.cpp | 12 ++++----- sim/simx/decode.cpp | 6 ++--- sim/simx/emulator.cpp | 10 +++++++- sim/simx/emulator.h | 3 ++- sim/simx/execute.cpp | 22 ++++++----------- sim/simx/func_unit.cpp | 1 - sim/simx/func_unit.h | 1 - sim/simx/main.cpp | 2 -- tests/regression/matmul/Makefile | 2 +- tests/regression/matmul/kernel.cpp | 8 +++--- tests/regression/matmul/main.cpp | 22 +++-------------- tests/regression/matmul/matmul_regression.sh | 26 ++++++++++++++++++++ 18 files changed, 76 insertions(+), 90 deletions(-) delete mode 100755 run_final.sh create mode 100755 tests/regression/matmul/matmul_regression.sh diff --git a/ci/blackbox.sh b/ci/blackbox.sh index defad4c05..8bcb120f3 100755 --- a/ci/blackbox.sh +++ b/ci/blackbox.sh @@ -48,8 +48,6 @@ PERF_CLASS=0 REBUILD=2 TEMPBUILD=0 LOGFILE=run.log -TC_SIZE=567 -TC_NUM=123 for i in "$@" do @@ -182,7 +180,6 @@ then fi CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS" -# CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DTC_NUM=$TC_NUM -DTC_SIZE=$TC_SIZE $L2 $L3 $PERF_FLAG $CONFIGS" echo "CONFIGS=$CONFIGS" if [ $REBUILD -ne 0 ] diff --git a/ci/regression.sh.in b/ci/regression.sh.in index 50d309af6..3c89ac996 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -124,7 +124,9 @@ regression() # test local barrier ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tbar" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -tbar" - + + # test for matmul + CONFIGS="-DTC_NUM=4 -DTC_SIZE=8" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args="-n128 -d1" echo "regression tests done!" } diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 651234768..ef9306503 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -111,20 +111,20 @@ `endif `define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE) +// Size of Tensor Core `ifndef TC_SIZE -`define TC_SIZE 4 +`define TC_SIZE 8 `endif +// Number of TCs per Warp `ifndef TC_NUM -`define TC_NUM 1 +`define TC_NUM 4 `endif -// Number of TCU units `ifndef NUM_TCU_LANES `define NUM_TCU_LANES `TC_NUM `endif -// Number of TCU units `ifndef NUM_TCU_BLOCKS `define NUM_TCU_BLOCKS `ISSUE_WIDTH `endif diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index 9a8f93234..23fb16904 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -196,7 +196,7 @@ `define VX_CSR_NUM_CORES 12'hFC2 `define VX_CSR_LOCAL_MEM_BASE 12'hFC3 -`define VX_MAT_MUL_SIZE 12'hFC4 +`define VX_MAT_MUL_SIZE 12'hFC4 // VX_MAT_MUL_SIZE = Matrix Size / TC Size `define VX_TC_NUM 12'hFC5 `define VX_TC_SIZE 12'hFC6 diff --git a/kernel/include/vx_intrinsics.h b/kernel/include/vx_intrinsics.h index b67a770da..5d16d44da 100644 --- a/kernel/include/vx_intrinsics.h +++ b/kernel/include/vx_intrinsics.h @@ -222,21 +222,19 @@ inline void vx_fence() { } //Matrix load -//Converted instruction type cause destination registers were not getiing blocked otherwise -inline void mload(unsigned dest, unsigned addr) +inline void vx_matrix_load(unsigned dest, unsigned addr) { asm volatile (".insn i 0x7b, 0, x0, %0(%1)" :: "i"(dest), "r"(addr)); } -//mat store -inline void ms(unsigned addr) +//Matrix Store +inline void vx_matrix_store(unsigned addr) { asm volatile (".insn i 0x7b, 1, x0, 0(%0)" :: "r"(addr)); } -//mat mul -//num tiles along reduced K dimension of matmul as imm value (can use rd,rs field to expand range of n_tiles from 12 bits) -inline void mm() +//Matrix Mul +inline void vx_matrix_mul() { asm volatile (".insn i 0x7b, 2, x0, 0(x0)"); } diff --git a/run_final.sh b/run_final.sh deleted file mode 100755 index 5f618dc64..000000000 --- a/run_final.sh +++ /dev/null @@ -1,22 +0,0 @@ -# Define arrays for threads, warps, and matrix sizes -matrix_sizes=(16 32 64 128 256 512) -tcsizes=(8 16 32) -tcnums=(4 8 16 32) -#lsulanes=(4 16) -#cores=(32) - - -# Loop through each combination of threads and warps -for size in "${matrix_sizes[@]}"; do - sed -i "s/OPTS ?= -n[0-9]\+/OPTS ?= -n${size}/" ../tests/regression/matmul/Makefile - sed -i "s/OPTS ?= -n[0-9]\+/OPTS ?= -n${size}/" tests/regression/matmul/Makefile - echo "Matrix size changed to ${size} in Makefile" - for tcsize in "${tcsizes[@]}"; do - for tcnum in "${tcnums[@]}"; do - log_name="sim_final/mat${size}/tcsize${tcsize}_tcnum${tcnum}_32w32t" - command="./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --tc_size=${tcsize} --tc_num=${tcnum} --rebuild=1 --perf=1 > ${log_name} 2>&1" - echo "$command" - eval "$command" - done - done -done diff --git a/runtime/simx/vortex.cpp b/runtime/simx/vortex.cpp index 4210ab0b6..5ab5e14f5 100644 --- a/runtime/simx/vortex.cpp +++ b/runtime/simx/vortex.cpp @@ -69,12 +69,12 @@ class vx_device { case VX_CAPS_NUM_CORES: _value = NUM_CORES * NUM_CLUSTERS; break; - // case VX_CAPS_TC_SIZE: - // _value = TC_SIZE; - // break; - // case VX_CAPS_TC_NUM: - // _value = TC_NUM; - // break; + case VX_CAPS_TC_SIZE: + _value = TC_SIZE; + break; + case VX_CAPS_TC_NUM: + _value = TC_NUM; + break; case VX_CAPS_CACHE_LINE_SIZE: _value = CACHE_BLOCK_SIZE; break; diff --git a/sim/simx/decode.cpp b/sim/simx/decode.cpp index 4d8d0a105..21d0e61dd 100644 --- a/sim/simx/decode.cpp +++ b/sim/simx/decode.cpp @@ -410,9 +410,9 @@ static const char* op_string(const Instr &instr) { case Opcode::TCU: switch(func3) { - case 0: return "ML"; // - case 1: return "MS"; // - case 2: return "MATMUL"; + case 0: return "ML"; // Matrix Load + case 1: return "MS"; // Matrix Store + case 2: return "MATMUL"; // Matrix Multiply default: std::abort(); } diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp index d2faf7f98..0dc8a06c4 100644 --- a/sim/simx/emulator.cpp +++ b/sim/simx/emulator.cpp @@ -74,7 +74,10 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core) , core_(core) , warps_(arch.num_warps(), arch) , barriers_(arch.num_barriers(), 0) - , scratchpad(std::vector(32 * 32 * 32768)) //Fix this : Max TC_SIZE = 32 + // Currently, tradeoff between scratchpad size & performance has not been evaluated. Scratchpad is + // considered to be big enough to hold input tiles for one output tile. + // In future versions, scratchpad size should be fixed to an appropriate value. + , scratchpad(std::vector(32 * 32 * 32768)) { this->clear(); } @@ -360,6 +363,11 @@ Word Emulator::get_tc_size() return tc_size; } +Word Emulator::get_tc_num() +{ + return tc_num; +} + Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { auto core_perf = core_->perf_stats(); switch (addr) { diff --git a/sim/simx/emulator.h b/sim/simx/emulator.h index 743c2786e..fe3aadf81 100644 --- a/sim/simx/emulator.h +++ b/sim/simx/emulator.h @@ -56,7 +56,8 @@ class Emulator { Word get_tiles(); Word get_tc_size(); - + Word get_tc_num(); + private: struct ipdom_entry_t { diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index 0dfd72a0f..20025f40b 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -1429,8 +1429,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { uint32_t n_tiles = this->get_csr(VX_MAT_MUL_SIZE, 0, wid); //CSR instruction before MLOAD will ensure that this csr has value int num_data_per_thread; int num_data_per_thread_st; - int num_threads_actv; - int num_threads_actv_st; + uint32_t num_threads_actv; + uint32_t num_threads_actv_st; uint32_t data_bytes_load; uint32_t data_bytes_store; uint32_t num_threads_per_tc = MAX (1, num_threads/TC_per_warp); @@ -1506,7 +1506,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { auto trace_data = std::make_shared(num_threads); trace->data = trace_data; - uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2; for (uint32_t t = thread_start; t < num_threads_actv_st; ++t) { @@ -1521,12 +1520,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { //Store C for (int n=0; n csr (TODO :: removed intermediate CSR stage ; incorporate limited scratchmad implementation) - //core_->set_csr(csr_addr[(2*num_data_per_thread) + n], scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread) + n], t, warp_id_); Word* temp_ref = &(warp.ireg_file.at(t).at(rsrc0)); *temp_ref = scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread_st) + n]; @@ -1534,7 +1527,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { } } //Clear the scratchpad - for(int i =0 ; i < scratchpad.size(); i++) + for(long unsigned int i=0 ; i < scratchpad.size(); i++) { scratchpad[i] = 0; } @@ -1545,7 +1538,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { DP(4, "TCU MULTIPLY MAT"); trace->fu_type = FUType::TCU; trace->tcu_type = TCUType::TCU_MUL; - uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2; uint32_t threads_per_tc = MAX (1, num_threads/TC_per_warp); for (uint32_t t = thread_start; t < num_threads_actv; ++t) { @@ -1556,12 +1548,14 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { //TC operation [only 1 thread in 1 warp needs to do this] if (t%threads_per_tc == 0) { - //TODO : change to systolic array implementation + /* + // TODO : Fix needed for functional correctness + // TODO : change to systolic array implementation uint32_t thread_offset = t*(tc_size*tc_size); + int loop_offset = 0; int offset_b = n_tiles*n_tiles*n_tiles*tc_size*tc_size; - /* - // TODO : Fix needed for functional correctness + uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2; for(int tiles = 0 ; tiles < n_tiles ; tiles++) //What's the HW implication of this?? A counter implementation? { for (int i = 0; i < tc_size; i++) { //ROW-1 diff --git a/sim/simx/func_unit.cpp b/sim/simx/func_unit.cpp index f53a1fb22..8acbfddeb 100644 --- a/sim/simx/func_unit.cpp +++ b/sim/simx/func_unit.cpp @@ -255,7 +255,6 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) { TcuUnit::TcuUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "TCU") - // , tc_size (core_->arch().tc_size()) {} void TcuUnit::tick() { diff --git a/sim/simx/func_unit.h b/sim/simx/func_unit.h index a7f182efe..cf119a5c3 100644 --- a/sim/simx/func_unit.h +++ b/sim/simx/func_unit.h @@ -103,7 +103,6 @@ class LsuUnit : public FuncUnit { class TcuUnit : public FuncUnit { public: TcuUnit(const SimContext& ctx, Core*); - // uint64_t tc_size; void tick(); }; diff --git a/sim/simx/main.cpp b/sim/simx/main.cpp index 9031a0a02..0f61de6f4 100644 --- a/sim/simx/main.cpp +++ b/sim/simx/main.cpp @@ -35,8 +35,6 @@ static void show_usage() { uint32_t num_threads = NUM_THREADS; uint32_t num_warps = NUM_WARPS; uint32_t num_cores = NUM_CORES; -uint32_t tc_size = TC_SIZE; -uint32_t tc_num = TC_NUM; bool showStats = false; const char* program = nullptr; diff --git a/tests/regression/matmul/Makefile b/tests/regression/matmul/Makefile index 0ef207194..7f1c48523 100644 --- a/tests/regression/matmul/Makefile +++ b/tests/regression/matmul/Makefile @@ -9,6 +9,6 @@ SRCS := $(SRC_DIR)/main.cpp VX_SRCS := $(SRC_DIR)/kernel.cpp -OPTS ?= -n512 -d1 -s4 -t4 +OPTS ?= -n128 -d1 include ../common.mk diff --git a/tests/regression/matmul/kernel.cpp b/tests/regression/matmul/kernel.cpp index a4585fb53..b0b4753c7 100644 --- a/tests/regression/matmul/kernel.cpp +++ b/tests/regression/matmul/kernel.cpp @@ -107,15 +107,15 @@ void kernel_body(kernel_arg_t* __UNIFORM__ arg) { csr_write(VX_TC_NUM,TC_per_warp); csr_write(VX_TC_SIZE,tc_size); - mload (0, a_addr_base); - mload (1, b_addr_base); + vx_matrix_load (0, a_addr_base); + vx_matrix_load (1, b_addr_base); //In case of multiple threads - sync load vx_fence(); - mm(); //Assuming padding to ensure matrix size is a multiple of tc_size + vx_matrix_mul(); //Assuming padding to ensure matrix size is a multiple of tc_size vx_fence(); if (((task_id%num_tasks_per_warp)/num_tasks_per_thread) < thread_limit_c) - ms(c_addr_base); + vx_matrix_store(c_addr_base); //In case of multiple threads - sync store vx_fence(); } diff --git a/tests/regression/matmul/main.cpp b/tests/regression/matmul/main.cpp index b2238bf5a..9b3465c52 100644 --- a/tests/regression/matmul/main.cpp +++ b/tests/regression/matmul/main.cpp @@ -21,8 +21,6 @@ const char* kernel_file = "kernel.vxbin"; uint32_t matrix_size = 0; -uint32_t tc_num = 4; -uint32_t TC_size = 8; vx_device_h device = nullptr; vx_buffer_h A_buffer = nullptr; @@ -41,7 +39,7 @@ static void show_usage() { static void parse_args(int argc, char **argv, uint32_t &data_size) { int c; - while ((c = getopt(argc, argv, "n:k:d:t:s:h?")) != -1) { + while ((c = getopt(argc, argv, "n:k:d:h?")) != -1) { switch (c) { case 'n': matrix_size = atoi(optarg); @@ -52,12 +50,6 @@ static void parse_args(int argc, char **argv, uint32_t &data_size) { case 'd': data_size = atoi(optarg); break; - case 't': - tc_num = atoi(optarg); - break; - case 's': - TC_size = atoi(optarg); - break; case 'h': case '?': { show_usage(); @@ -151,21 +143,15 @@ int main(int argc, char *argv[]) { RT_CHECK(vx_dev_open(&device)); uint64_t num_cores, num_warps, num_threads; - uint32_t tc_size, TC_per_warp; + uint64_t tc_size, TC_per_warp; RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores)); RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps)); RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads)); - std::cout << "Debug :: tc_size (optarg) = " << TC_size << std::endl; - std::cout << "Debug :: tc_num (optarg) = " << tc_num << std::endl; - //Add assert/knob - tc_size = TC_size; - TC_per_warp = tc_num; - - // RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size)); - // RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp)); std::cout << "Debug :: tc_size = " << tc_size << std::endl; std::cout << "Debug :: tc_num = " << TC_per_warp << std::endl; diff --git a/tests/regression/matmul/matmul_regression.sh b/tests/regression/matmul/matmul_regression.sh new file mode 100755 index 000000000..8d35fcfd3 --- /dev/null +++ b/tests/regression/matmul/matmul_regression.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# README: +# This script launches a sweep of TC_SIZE, TC_NUM and MATRIX SIZES +# default values of NUM_WARPS=32, NUM_THREADS=32, NUM_CORES=4, DATA_SIZE=1 +# Edit matrix_sizes, tcsizes & tcnums variables to vary the sweep limits + +# Define arrays for tc_size,tc_num and matrix sizes +matrix_sizes=(16 32 64 128 256 512) +tcsizes=(8 16 32) +tcnums=(4 8 16 32) + +cd ../../../build/ + +# Loop through each combination of above configs +for size in "${matrix_sizes[@]}"; do + for tcsize in "${tcsizes[@]}"; do + for tcnum in "${tcnums[@]}"; do + mkdir -p sim_final/mat${size} + log_name="sim_final/mat${size}/tcsize${tcsize}_tcnum${tcnum}_32w32t" + cmd="CONFIGS=\"-DTC_NUM=${tcnum} -DTC_SIZE=${tcsize}\" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args=\"-n${size} -d1\" --rebuild=1 --perf=1 > ${log_name} 2>&1" + echo $cmd + eval $cmd + done + done +done From 5e63b8f35ac3b695d574fde4ad0a280ecbe1b83a Mon Sep 17 00:00:00 2001 From: Nayan Sivakumar Nair Date: Tue, 25 Jun 2024 23:27:18 -0400 Subject: [PATCH 5/7] dummy commit --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 6eeb1ccfa..a1593a67a 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ Vortex is a full-stack open-source RISC-V GPGPU. ## Specifications - Support RISC-V RV32IMAF and RV64IMAFD + - Microarchitecture: - configurable number of cores, warps, and threads. - configurable number of ALU, FPU, LSU, and SFU units per core. From d1175a03c9606dce16cdc8f16772fba701fbb0af Mon Sep 17 00:00:00 2001 From: jaewon-lee-github Date: Wed, 2 Oct 2024 14:16:57 -0400 Subject: [PATCH 6/7] update the code accessing registers in obsoleted way --- sim/simx/emulator.cpp | 2 +- sim/simx/execute.cpp | 2 +- sim/simx/func_unit.cpp | 6 +++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp index 8d57f69fa..08c51845c 100644 --- a/sim/simx/emulator.cpp +++ b/sim/simx/emulator.cpp @@ -74,11 +74,11 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core) , core_(core) , warps_(arch.num_warps(), arch) , barriers_(arch.num_barriers(), 0) + , ipdom_size_(arch.num_threads()-1) // [TBC] Currently, tradeoff between scratchpad size & performance has not been evaluated. Scratchpad is // considered to be big enough to hold input tiles for one output tile. // In future versions, scratchpad size should be fixed to an appropriate value. , scratchpad(std::vector(32 * 32 * 32768)) - , ipdom_size_(arch.num_threads()-1) { this->clear(); } diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index a7d8a937d..e70d45cb2 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -1473,7 +1473,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { trace->fu_type = FUType::LSU; trace->lsu_type = LsuType::TCU_LOAD; - trace->used_iregs.set(rsrc0); + trace->src_regs[0] = {RegType::Integer, rsrc0}; auto trace_data = std::make_shared(num_threads); trace->data = trace_data; diff --git a/sim/simx/func_unit.cpp b/sim/simx/func_unit.cpp index 2de58639b..a182f6d8b 100644 --- a/sim/simx/func_unit.cpp +++ b/sim/simx/func_unit.cpp @@ -222,7 +222,10 @@ void LsuUnit::tick() { input.pop(); } } -///////// TENSOR code TBC //////////////////////////////// +/* TO BE FIXED:Tensor_core code + send_request is not used anymore. Need to be modified number of load +*/ +/* int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) { int count = 0; @@ -275,6 +278,7 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) { } return count; } +*/ /////////////////////////////////////////////////////////////////////////////// From b7531c9de1d4acbc33e2040fb6f4f100eb96d015 Mon Sep 17 00:00:00 2001 From: jaewon-lee-github Date: Wed, 2 Oct 2024 17:46:01 -0400 Subject: [PATCH 7/7] support 64bit --- tests/regression/matmul/kernel.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/regression/matmul/kernel.cpp b/tests/regression/matmul/kernel.cpp index b0b4753c7..5fa976df4 100644 --- a/tests/regression/matmul/kernel.cpp +++ b/tests/regression/matmul/kernel.cpp @@ -8,9 +8,9 @@ void kernel_body(kernel_arg_t* __UNIFORM__ arg) { int32_t* src0_ptr = (int32_t*)arg->src0_addr; int32_t* src1_ptr = (int32_t*)arg->src1_addr; int32_t* dst_ptr = (int32_t*)arg->dst_addr; - unsigned a_addr = reinterpret_cast(src0_ptr); - unsigned b_addr = reinterpret_cast(src1_ptr); - unsigned c_addr = reinterpret_cast(dst_ptr); + uint64_t a_addr = reinterpret_cast(src0_ptr); + uint64_t b_addr = reinterpret_cast(src1_ptr); + uint64_t c_addr = reinterpret_cast(dst_ptr); uint32_t tc_size = arg->tc_size; uint32_t TC_per_warp = arg->TC_per_warp; @@ -100,9 +100,9 @@ void kernel_body(kernel_arg_t* __UNIFORM__ arg) { //TODO :: change this for new task->thread distribution if (((task_id%num_tasks_per_warp)/num_tasks_per_thread) < thread_limit) { - unsigned a_addr_base = a_addr + offset*arg->data_size; - unsigned b_addr_base = b_addr + offset*arg->data_size; - unsigned c_addr_base = c_addr + offset_c*arg->data_size; + uint64_t a_addr_base = a_addr + offset*arg->data_size; + uint64_t b_addr_base = b_addr + offset*arg->data_size; + uint64_t c_addr_base = c_addr + offset_c*arg->data_size; csr_write(VX_MAT_MUL_SIZE,n_tiles); csr_write(VX_TC_NUM,TC_per_warp); csr_write(VX_TC_SIZE,tc_size);