From 99c6a1af5a58cbd915f2a0ed9fe279d77d99498a Mon Sep 17 00:00:00 2001
From: Varsha Singhania <vsinghania8@gatech.edu>
Date: Mon, 17 Jun 2024 04:28:51 -0400
Subject: [PATCH 1/7] Tensor cores in Vortex

---
 ci/blackbox.sh                     |  12 +-
 hw/rtl/VX_config.vh                |  18 ++
 hw/rtl/VX_types.vh                 |   3 +
 kernel/include/vx_intrinsics.h     |  20 ++
 runtime/include/vortex.h           |   2 +
 runtime/simx/vortex.cpp            |   8 +-
 sim/simx/arch.h                    |  15 +-
 sim/simx/core.cpp                  |   4 +-
 sim/simx/core.h                    |   1 +
 sim/simx/decode.cpp                |  19 ++
 sim/simx/emulator.cpp              |  18 +-
 sim/simx/emulator.h                |   4 +
 sim/simx/execute.cpp               | 179 +++++++++++++++
 sim/simx/func_unit.cpp             |  91 ++++++--
 sim/simx/func_unit.h               |   9 +
 sim/simx/instr.h                   |   2 +-
 sim/simx/instr_trace.h             |   1 +
 sim/simx/main.cpp                  |   4 +-
 sim/simx/types.h                   |  23 +-
 tests/regression/matmul/Makefile   |  14 ++
 tests/regression/matmul/common.h   |  17 ++
 tests/regression/matmul/kernel.cpp | 124 +++++++++++
 tests/regression/matmul/main.cpp   | 343 +++++++++++++++++++++++++++++
 23 files changed, 899 insertions(+), 32 deletions(-)
 create mode 100644 tests/regression/matmul/Makefile
 create mode 100644 tests/regression/matmul/common.h
 create mode 100644 tests/regression/matmul/kernel.cpp
 create mode 100644 tests/regression/matmul/main.cpp

diff --git a/ci/blackbox.sh b/ci/blackbox.sh
index fe94677aa..8a04133f9 100755
--- a/ci/blackbox.sh
+++ b/ci/blackbox.sh
@@ -48,6 +48,8 @@ PERF_CLASS=0
 REBUILD=2
 TEMPBUILD=0
 LOGFILE=run.log
+TC_SIZE=567
+TC_NUM=123
 
 for i in "$@"
 do
@@ -112,6 +114,14 @@ case $i in
         LOGFILE=${i#*=}
         shift
         ;;
+    --tc_size=*)
+        TC_SIZE=${i#*=}
+        shift
+        ;;
+    --tc_num=*)
+        TC_NUM=${i#*=}
+        shift
+        ;;
     --help)
         show_help
         exit 0
@@ -180,7 +190,7 @@ then
 fi
 
 CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS"
-
+CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DTC_NUM=$TC_NUM -DTC_SIZE=$TC_SIZE $L2 $L3 $PERF_FLAG $CONFIGS"
 echo "CONFIGS=$CONFIGS"
 
 if [ $REBUILD -ne 0 ]
diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh
index 7fc8d1464..651234768 100644
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@@ -111,6 +111,24 @@
 `endif
 `define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)
 
+`ifndef TC_SIZE
+`define TC_SIZE 4
+`endif
+
+`ifndef TC_NUM
+`define TC_NUM 1
+`endif
+
+// Number of TCU units
+`ifndef NUM_TCU_LANES
+`define NUM_TCU_LANES   `TC_NUM
+`endif
+
+// Number of TCU units
+`ifndef NUM_TCU_BLOCKS
+`define NUM_TCU_BLOCKS  `ISSUE_WIDTH
+`endif
+
 `ifdef L2_ENABLE
     `define L2_ENABLED   1
 `else
diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh
index e744a26f9..06929b058 100644
--- a/hw/rtl/VX_types.vh
+++ b/hw/rtl/VX_types.vh
@@ -196,4 +196,7 @@
 `define VX_CSR_NUM_CORES                12'hFC2
 `define VX_CSR_LOCAL_MEM_BASE           12'hFC3
 
+`define VX_MAT_MUL_SIZE                 12'hFC4
+
+
 `endif // VX_TYPES_VH
diff --git a/kernel/include/vx_intrinsics.h b/kernel/include/vx_intrinsics.h
index 6000065e9..b67a770da 100644
--- a/kernel/include/vx_intrinsics.h
+++ b/kernel/include/vx_intrinsics.h
@@ -221,6 +221,26 @@ inline void vx_fence() {
     __asm__ volatile ("fence iorw, iorw");
 }
 
+//Matrix load
+//Converted instruction type cause destination registers were not getiing blocked otherwise
+inline void mload(unsigned dest, unsigned  addr) 
+{
+    asm volatile (".insn i 0x7b, 0, x0, %0(%1)" :: "i"(dest), "r"(addr));
+}
+
+//mat store
+inline void ms(unsigned  addr) 
+{
+    asm volatile (".insn i 0x7b, 1, x0, 0(%0)" :: "r"(addr));
+}
+
+//mat mul
+//num tiles along reduced K dimension of matmul as imm value (can use rd,rs field to expand range of n_tiles from 12 bits)
+inline void mm() 
+{
+    asm volatile (".insn i 0x7b, 2, x0, 0(x0)");
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/runtime/include/vortex.h b/runtime/include/vortex.h
index c9dd6ec36..f1a412b81 100644
--- a/runtime/include/vortex.h
+++ b/runtime/include/vortex.h
@@ -34,6 +34,8 @@ typedef void* vx_buffer_h;
 #define VX_CAPS_GLOBAL_MEM_SIZE     0x5
 #define VX_CAPS_LOCAL_MEM_SIZE      0x6
 #define VX_CAPS_ISA_FLAGS           0x7
+#define VX_CAPS_TC_SIZE             0x8
+#define VX_CAPS_TC_NUM              0x9
 
 // device isa flags
 #define VX_ISA_STD_A                (1ull << 0)
diff --git a/runtime/simx/vortex.cpp b/runtime/simx/vortex.cpp
index 89856f3a0..f65d7b385 100644
--- a/runtime/simx/vortex.cpp
+++ b/runtime/simx/vortex.cpp
@@ -32,7 +32,7 @@ using namespace vortex;
 class vx_device {
 public:
   vx_device()
-    : arch_(NUM_THREADS, NUM_WARPS, NUM_CORES)
+    : arch_(NUM_THREADS, NUM_WARPS, NUM_CORES, TC_SIZE, TC_NUM)
     , ram_(0, RAM_PAGE_SIZE)
     , processor_(arch_)
     , global_mem_(ALLOC_BASE_ADDR,
@@ -69,6 +69,12 @@ class vx_device {
     case VX_CAPS_NUM_CORES:
       _value = NUM_CORES * NUM_CLUSTERS;
       break;
+    case VX_CAPS_TC_SIZE:
+      _value = TC_SIZE;
+      break;
+    case VX_CAPS_TC_NUM:
+      _value = TC_NUM;
+      break;
     case VX_CAPS_CACHE_LINE_SIZE:
       _value = CACHE_BLOCK_SIZE;
       break;
diff --git a/sim/simx/arch.h b/sim/simx/arch.h
index 2507bf28f..e35687dbd 100644
--- a/sim/simx/arch.h
+++ b/sim/simx/arch.h
@@ -35,9 +35,11 @@ class Arch {
   uint16_t num_barriers_;
   uint16_t ipdom_size_;
   uint64_t local_mem_base_;
+  uint16_t tc_size_;
+  uint16_t tc_num_;
 
 public:
-  Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores)
+  Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint64_t tc_size, uint64_t tc_num)   
     : num_threads_(num_threads)
     , num_warps_(num_warps)
     , num_cores_(num_cores)
@@ -49,6 +51,8 @@ class Arch {
     , num_barriers_(NUM_BARRIERS)
     , ipdom_size_((num_threads-1) * 2)
     , local_mem_base_(LMEM_BASE_ADDR)
+    , tc_size_ (tc_size)
+    , tc_num_ (tc_num)
   {}
 
   uint16_t vsize() const {
@@ -94,6 +98,15 @@ class Arch {
   uint16_t socket_size() const {
     return socket_size_;
   }
+  
+  uint16_t tc_size() const {
+    return tc_size_;
+  }
+
+  uint16_t tc_num() const {
+    return tc_num_;
+  }
+
 };
 
 }
\ No newline at end of file
diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp
index 0bd72524d..7020cf8ff 100644
--- a/sim/simx/core.cpp
+++ b/sim/simx/core.cpp
@@ -105,12 +105,14 @@ Core::Core(const SimContext& ctx,
   dispatchers_.at((int)FUType::FPU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES);
   dispatchers_.at((int)FUType::LSU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_LSU_BLOCKS, NUM_LSU_LANES);
   dispatchers_.at((int)FUType::SFU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_SFU_BLOCKS, NUM_SFU_LANES);
-
+  dispatchers_.at((int)FUType::TCU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_TCU_BLOCKS, NUM_TCU_LANES);
+  
   // initialize execute units
   func_units_.at((int)FUType::ALU) = SimPlatform::instance().create_object<AluUnit>(this);
   func_units_.at((int)FUType::FPU) = SimPlatform::instance().create_object<FpuUnit>(this);
   func_units_.at((int)FUType::LSU) = SimPlatform::instance().create_object<LsuUnit>(this);
   func_units_.at((int)FUType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this);
+  func_units_.at((int)FUType::TCU) = SimPlatform::instance().create_object<TcuUnit>(this);
 
   // bind commit arbiters
   for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
diff --git a/sim/simx/core.h b/sim/simx/core.h
index cc0e46c8c..0b82de84a 100644
--- a/sim/simx/core.h
+++ b/sim/simx/core.h
@@ -170,6 +170,7 @@ class Core : public SimObject<Core> {
   friend class AluUnit;
   friend class FpuUnit;
   friend class SfuUnit;
+  friend class TcuUnit;
 };
 
 } // namespace vortex
diff --git a/sim/simx/decode.cpp b/sim/simx/decode.cpp
index f934524c3..4d8d0a105 100644
--- a/sim/simx/decode.cpp
+++ b/sim/simx/decode.cpp
@@ -51,6 +51,7 @@ static const std::unordered_map<Opcode, InstType> sc_instTable = {
   {Opcode::EXT2,    InstType::R4},
   {Opcode::R_W,     InstType::R},
   {Opcode::I_W,     InstType::I},
+  {Opcode::TCU,     InstType::I},
 };
 
 enum Constants {
@@ -405,6 +406,16 @@ static const char* op_string(const Instr &instr) {
     default:
       std::abort();
     }
+  
+  case Opcode::TCU:
+    switch(func3)
+    {
+      case 0: return "ML"; //
+      case 1: return "MS"; //
+      case 2: return "MATMUL";
+      default:
+        std::abort();
+    }
   default:
     std::abort();
   }
@@ -543,6 +554,14 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
 
   case InstType::I: {
     switch (op) {
+    case Opcode::TCU: {
+      instr->setDestReg(rs1, RegType::Integer);
+      instr->addSrcReg(rs1, RegType::Integer);
+      instr->setFunc3(func3);
+      instr->setFunc7(func7);
+      auto imm = code >> shift_rs2;
+      instr->setImm(sext(imm, width_i_imm));
+    } break;
     case Opcode::I:
     case Opcode::I_W:
     case Opcode::JALR:
diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp
index 5850bfd56..ea5f72c42 100644
--- a/sim/simx/emulator.cpp
+++ b/sim/simx/emulator.cpp
@@ -74,6 +74,7 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
     , core_(core)
     , warps_(arch.num_warps(), arch)
     , barriers_(arch.num_barriers(), 0)
+    , scratchpad(std::vector<Word>(core->arch().tc_size() * core->arch().tc_size() * 32768)) //Fix this
 {
   this->clear();
 }
@@ -110,6 +111,11 @@ void Emulator::clear() {
   active_warps_.set(0);
   warps_[0].tmask.set(0);
   wspawn_.valid = false;
+
+  for (auto& reg : scratchpad) 
+  {
+    reg = 0;
+  }
 }
 
 void Emulator::attach_ram(RAM* ram) {
@@ -344,6 +350,11 @@ void Emulator::cout_flush() {
     case (addr + (VX_CSR_MPM_BASE_H-VX_CSR_MPM_BASE)) : return ((value >> 32) & 0xFFFFFFFF)
 #endif
 
+Word Emulator::get_tiles()
+{
+  return mat_size;
+}
+
 Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
   auto core_perf = core_->perf_stats();
   switch (addr) {
@@ -375,6 +386,8 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
   case VX_CSR_NUM_CORES:  return uint32_t(arch_.num_cores()) * arch_.num_clusters();
   case VX_CSR_LOCAL_MEM_BASE: return arch_.local_mem_base();
   case VX_CSR_MSCRATCH:   return csr_mscratch_;
+  case VX_MAT_MUL_SIZE:   return mat_size;
+
   CSR_READ_64(VX_CSR_MCYCLE, core_perf.cycles);
   CSR_READ_64(VX_CSR_MINSTRET, core_perf.instrs);
   default:
@@ -484,6 +497,9 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
   case VX_CSR_MNSTATUS:
   case VX_CSR_MCAUSE:
     break;
+  case VX_MAT_MUL_SIZE:
+    mat_size = value;
+    break;
   default: {
       std::cout << std::hex << "Error: invalid CSR write addr=0x" << addr << ", value=0x" << value << std::endl;
       std::abort();
@@ -500,4 +516,4 @@ void Emulator::update_fcrs(uint32_t fflags, uint32_t tid, uint32_t wid) {
     this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, tid, wid) | fflags, tid, wid);
     this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, tid, wid) | fflags, tid, wid);
   }
-}
\ No newline at end of file
+}
diff --git a/sim/simx/emulator.h b/sim/simx/emulator.h
index 81dcecd83..82b5bc98b 100644
--- a/sim/simx/emulator.h
+++ b/sim/simx/emulator.h
@@ -53,6 +53,8 @@ class Emulator {
   bool wspawn(uint32_t num_warps, Word nextPC);
 
   int get_exitcode() const;
+  
+  Word get_tiles();
 
 private:
 
@@ -121,6 +123,8 @@ class Emulator {
   MemoryUnit  mmu_;
   Word        csr_mscratch_;
   wspawn_t    wspawn_;
+  std::vector<Word> scratchpad;
+  uint32_t mat_size;
 };
 
 }
diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp
index e0fc2b94a..d522145db 100644
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
@@ -25,6 +25,7 @@
 #include "emulator.h"
 #include "instr.h"
 #include "core.h"
+#include "VX_types.h"
 
 using namespace vortex;
 
@@ -1414,6 +1415,184 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
       std::abort();
     }
   } break;
+  case Opcode::TCU: 
+  { //TODO - make it data-type flexible
+    uint32_t mem_bytes = 1;
+    DP(3, "mem_bytes=" << mem_bytes << std::endl);
+    uint16_t tc_size = core_->arch().tc_size();
+    uint32_t TC_per_warp = core_->arch().tc_num();
+
+    //Number of loads - dependant on the thread config
+    uint32_t n_tiles = this->get_csr(VX_MAT_MUL_SIZE, 0, wid);  //CSR instruction before MLOAD will ensure that this csr has value
+    int num_data_per_thread;
+    int num_data_per_thread_st;
+    int num_threads_actv;
+    int num_threads_actv_st;
+    uint32_t data_bytes_load;
+    uint32_t data_bytes_store;
+    uint32_t num_threads_per_tc = MAX (1, num_threads/TC_per_warp);
+
+    //int num_warps = MIN()
+    //int active_tcs =  MIN (TC_per_warp, num_output_tiles/num_warps)
+    //LOAD
+    if(num_threads > tc_size*tc_size*n_tiles*TC_per_warp)
+    { 
+      num_threads_actv = tc_size*tc_size*n_tiles*TC_per_warp;
+      num_data_per_thread = 1;
+    }
+    else
+    {
+      num_threads_actv = num_threads;
+      num_data_per_thread = (tc_size*tc_size*n_tiles)/num_threads_per_tc;
+    }
+    data_bytes_load = mem_bytes*num_data_per_thread;
+
+    //STORE
+    
+    // DP(3, "DEBUG :: num_threads = " << num_threads);
+    // DP(3, "DEBUG :: tc_size*tc_size = " << tc_size*tc_size);
+    //DP(3, "imm = " << immsrc);
+    
+    if(num_threads > tc_size*tc_size*TC_per_warp)
+    { 
+      num_threads_actv_st = tc_size*tc_size*TC_per_warp;
+      num_data_per_thread_st = 1;
+    }
+    else
+    {
+      num_threads_actv_st = num_threads;
+      num_data_per_thread_st = (tc_size*tc_size)/num_threads_per_tc;
+    }
+    data_bytes_store = mem_bytes*num_data_per_thread_st;
+    
+    DP(3, "Num Tiles=" << n_tiles << std::endl);
+    
+    switch (func3) {
+      case 0: 
+      { //Matrix Load  
+
+        DP (4, "TCU LOAD");
+        trace->fu_type = FUType::LSU;
+        trace->lsu_type = LsuType::TCU_LOAD;
+        
+        trace->used_iregs.set(rsrc0);
+        auto trace_data = std::make_shared<LsuTraceData>(num_threads);
+        trace->data = trace_data;
+        
+        for (uint32_t t = thread_start; t < num_threads_actv; ++t) 
+        {
+          if (!warp.tmask.test(t))
+            continue;
+          DP(3, "Thread ID" << t); 
+
+          uint32_t base_addr = rsdata[t][0].i ;
+          trace_data->mem_addrs.at(t) = {base_addr, data_bytes_load};
+          
+          //Load A or B (depends on immsrc)
+          int loop_offset = 0;
+          DP(3, "n_tiles = " << n_tiles << "; num_data_per_thread = " << num_data_per_thread <<std::endl);
+            for (int n=0; n<num_data_per_thread; n++)
+            {
+              Word* temp_ref = &(warp.ireg_file.at(t).at(rsrc0));
+              this->dcache_read(temp_ref, (base_addr+(n*mem_bytes)+(loop_offset*mem_bytes)), mem_bytes);
+
+              scratchpad[loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n] = *temp_ref;
+              DP(3, "Scratchpad Index: " << loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n << ", Value: " << scratchpad[loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n]);
+            }
+            //loop_offset += tc_size*tc_size;
+          //}
+        }
+        rd_write = true;  
+      } break;
+      case 1: 
+      { 
+        DP(4, "TCU STORE");
+        trace->fu_type = FUType::LSU;
+        trace->lsu_type = LsuType::TCU_STORE;
+
+        auto trace_data = std::make_shared<LsuTraceData>(num_threads);
+        trace->data = trace_data;
+        uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2;
+
+        for (uint32_t t = thread_start; t < num_threads_actv_st; ++t) 
+        {
+          if (!warp.tmask.test(t))
+            continue;
+
+          DP(3, "Thread ID" << t); 
+          uint32_t base_addr = rsdata[t][0].i ;
+
+          trace_data->mem_addrs.at(t) = {base_addr, data_bytes_store};
+
+          //Store C
+          for (int n=0; n<num_data_per_thread_st; n++)
+          {
+            uint64_t mem_addr = (base_addr+(n*mem_bytes));
+            uint32_t csr_index = (2*num_data_per_thread_st) + n;
+            uint32_t scratchpad_index = (tc_size*tc_size*2) + (t*num_data_per_thread) + n;
+            
+            //scratchpad -> csr (TODO :: can intermediate step of moving to CSR be skipped?)
+            //core_->set_csr(csr_addr[(2*num_data_per_thread) + n], scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread) + n], t, warp_id_);
+            Word* temp_ref = &(warp.ireg_file.at(t).at(rsrc0));
+            *temp_ref = scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread_st) + n];
+
+            this->dcache_write(temp_ref, base_addr+(n*mem_bytes), mem_bytes);  
+          }
+        }
+        //Clear the scratchpad
+        for(int i =0 ; i < scratchpad.size(); i++)
+        {
+          scratchpad[i] = 0;
+        }
+      }
+      break;
+      case 2: 
+      { //Matrix Multiply
+        DP(4, "TCU MULTIPLY MAT");
+        trace->fu_type = FUType::TCU;
+        trace->tcu_type = TCUType::TCU_MUL;
+        uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2;
+        uint32_t threads_per_tc = MAX (1, num_threads/TC_per_warp);
+        for (uint32_t t = thread_start; t < num_threads_actv; ++t) 
+        {
+          if (!warp.tmask.test(t))
+            continue;
+         
+          DP(3, "Thread ID" << t); 
+          //TC operation [only 1 thread in 1 warp needs to do this]
+          if (t%threads_per_tc == 0)
+          {
+            //TODO - change to systolic array implementation
+            uint32_t thread_offset = t*(tc_size*tc_size);
+            int loop_offset = 0;
+            int offset_b = n_tiles*n_tiles*n_tiles*tc_size*tc_size;
+            // Loop over all tiles - output stationary
+            //for(int tiles = 0 ; tiles < n_tiles ; tiles++)  //What's the HW implication of this?? A counter implementation?
+            //{ 
+              /*
+              for (int i = 0; i < tc_size; i++) { //ROW-1
+                for (int j = 0; j < tc_size; j++) { //COL-2
+                  int sum = 0;
+                  for (int k = 0; k < tc_size; k++)
+                  { //COL-1
+                    sum = sum + scratchpad[loop_offset + thread_offset*n_tiles + i * tc_size + k] *scratchpad[loop_offset + thread_offset*n_tiles + offset_b + (k * tc_size + j)];
+                  }
+                  scratchpad[accu_offset + thread_offset +(i * tc_size + j)] += sum; //[i * col2 + j] = sum
+                  DP(3, "Scratchpad Index: " << accu_offset + (i * tc_size + j) << " , Value=" << scratchpad[accu_offset + (i * tc_size + j)]);
+
+                }
+              }
+              */
+              //loop_offset += tc_size*tc_size; //Move to the next tiled matmul fragment
+            //}
+          }
+        }
+
+      }break;
+      default:
+        std::abort();
+    }
+  } break;
   default:
     std::abort();
   }
diff --git a/sim/simx/func_unit.cpp b/sim/simx/func_unit.cpp
index c9a3f0fc7..3991a17e7 100644
--- a/sim/simx/func_unit.cpp
+++ b/sim/simx/func_unit.cpp
@@ -21,6 +21,7 @@
 #include "core.h"
 #include "constants.h"
 #include "cache_sim.h"
+#include "VX_types.h"
 
 using namespace vortex;
 
@@ -162,7 +163,7 @@ void LsuUnit::tick() {
 			continue;
 		}
 
-		bool is_write = (trace->lsu_type == LsuType::STORE);
+		bool is_write = ((trace->lsu_type == LsuType::STORE) || (trace->lsu_type == LsuType::TCU_STORE));
 
 		// check pending queue capacity
 		if (!is_write && state.pending_rd_reqs.full()) {
@@ -175,13 +176,14 @@ void LsuUnit::tick() {
 		}
 
 		uint32_t tag = 0;
+
 		if (!is_write) {
 			tag = state.pending_rd_reqs.allocate({trace, 0});
 		}
 
 		// send memory request
 		auto num_reqs = this->send_requests(trace, block_idx, tag);
-
+		
 		if (!is_write) {
 			state.pending_rd_reqs.at(tag).count = num_reqs;
 		}
@@ -200,7 +202,14 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
 	int count = 0;
 
 	auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
-	bool is_write = (trace->lsu_type == LsuType::STORE);
+	bool is_write = ((trace->lsu_type == LsuType::STORE) || (trace->lsu_type == LsuType::TCU_STORE));
+
+	uint16_t req_per_thread = 1;
+	if ((trace->lsu_type == LsuType::TCU_LOAD) || (trace->lsu_type == LsuType::TCU_STORE))
+	{
+ 		req_per_thread= (1>(trace_data->mem_addrs.at(0).size)/4)? 1: ((trace_data->mem_addrs.at(0).size)/4);
+	}
+	
 	auto t0 = trace->pid * NUM_LSU_LANES;
 
 	for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
@@ -213,33 +222,69 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
 
 		auto mem_addr = trace_data->mem_addrs.at(t);
 		auto type = get_addr_type(mem_addr.addr);
-
-		MemReq mem_req;
-		mem_req.addr  = mem_addr.addr;
-		mem_req.write = is_write;
-		mem_req.type  = type;
-		mem_req.tag   = tag;
-		mem_req.cid   = trace->cid;
-		mem_req.uuid  = trace->uuid;
-
-		dcache_req_port.push(mem_req, 1);
-		DT(3, "mem-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
-			<< ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace);
-
-		if (is_write) {
-			++core_->perf_stats_.stores;
-		} else {
-			++core_->perf_stats_.loads;
-			++pending_loads_;
+		// DT(3, "addr_type = " << type << ", " << *trace);		
+		uint32_t mem_bytes = 1;
+		for (int i = 0; i < req_per_thread; i++)
+		{
+			MemReq mem_req;
+			mem_req.addr  = mem_addr.addr + (i*mem_bytes);
+			mem_req.write = is_write;
+			mem_req.type  = type;
+			mem_req.tag   = tag;
+			mem_req.cid   = trace->cid;
+			mem_req.uuid  = trace->uuid;
+		
+			dcache_req_port.push(mem_req, 1);
+			DT(3, "mem-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
+				<< ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace);
+
+			if (is_write) {
+				++core_->perf_stats_.stores;
+			} else {
+				++core_->perf_stats_.loads;
+				++pending_loads_;
+			}
+		
+			++count;
 		}
-
-		++count;
 	}
 	return count;
 }
 
 ///////////////////////////////////////////////////////////////////////////////
 
+TcuUnit::TcuUnit(const SimContext& ctx, Core* core) 
+    : FuncUnit(ctx, core, "TCU")
+    , tc_size (core_->arch().tc_size())
+    {}
+
+void TcuUnit::tick() {
+
+	for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
+        auto& input = Inputs.at(i);
+        if (input.empty()) 
+            continue;
+        auto& output = Outputs.at(i);
+        auto trace = input.front();
+        uint32_t n_tiles = core_->emulator_.get_tiles();
+        switch (trace->tcu_type) {
+            case TCUType::TCU_MUL:
+            {    //mat size = n_tiles * tc_size
+                int matmul_latency = (n_tiles * tc_size) + tc_size + tc_size;
+                output.push(trace, matmul_latency);
+				DT(3, "matmul_latency = " << matmul_latency << ", " << *trace);
+                break;
+            }
+            default:
+                std::abort();
+        }    
+        DT(3, "pipeline-execute: op=" << trace->tcu_type << ", " << *trace);
+        input.pop();
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
 SfuUnit::SfuUnit(const SimContext& ctx, Core* core)
 	: FuncUnit(ctx, core, "SFU")
 {}
diff --git a/sim/simx/func_unit.h b/sim/simx/func_unit.h
index 45f0152ff..5fc922991 100644
--- a/sim/simx/func_unit.h
+++ b/sim/simx/func_unit.h
@@ -100,6 +100,15 @@ class LsuUnit : public FuncUnit {
 
 ///////////////////////////////////////////////////////////////////////////////
 
+class TcuUnit : public FuncUnit {
+public:
+    TcuUnit(const SimContext& ctx, Core*);
+    uint64_t tc_size;
+    void tick();
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
 class SfuUnit : public FuncUnit {
 public:
 	SfuUnit(const SimContext& ctx, Core*);
diff --git a/sim/simx/instr.h b/sim/simx/instr.h
index f97a19eac..061b4deb0 100644
--- a/sim/simx/instr.h
+++ b/sim/simx/instr.h
@@ -46,7 +46,7 @@ enum class Opcode {
   EXT1      = 0x0b,
   EXT2      = 0x2b,
   EXT3      = 0x5b,
-  EXT4      = 0x7b
+  TCU       = 0x7b
 };
 
 enum class InstType {
diff --git a/sim/simx/instr_trace.h b/sim/simx/instr_trace.h
index 532b736f5..9d6859fb7 100644
--- a/sim/simx/instr_trace.h
+++ b/sim/simx/instr_trace.h
@@ -75,6 +75,7 @@ struct instr_trace_t {
     AluType  alu_type;
     FpuType  fpu_type;
     SfuType  sfu_type;
+    TCUType  tcu_type; 
   };
 
   ITraceData::Ptr data;
diff --git a/sim/simx/main.cpp b/sim/simx/main.cpp
index 0f61de6f4..58eb96d61 100644
--- a/sim/simx/main.cpp
+++ b/sim/simx/main.cpp
@@ -35,6 +35,8 @@ static void show_usage() {
 uint32_t num_threads = NUM_THREADS;
 uint32_t num_warps = NUM_WARPS;
 uint32_t num_cores = NUM_CORES;
+uint32_t tc_size = TC_SIZE;
+uint32_t  tc_num = TC_NUM;
 bool showStats = false;
 const char* program = nullptr;
 
@@ -81,7 +83,7 @@ int main(int argc, char **argv) {
 
   {
     // create processor configuation
-    Arch arch(num_threads, num_warps, num_cores);
+    Arch arch(num_threads, num_warps, num_cores, tc_size, tc_num);
 
     // create memory module
     RAM ram(0, RAM_PAGE_SIZE);
diff --git a/sim/simx/types.h b/sim/simx/types.h
index a84216ae1..15623ce39 100644
--- a/sim/simx/types.h
+++ b/sim/simx/types.h
@@ -23,6 +23,7 @@
 #include <VX_config.h>
 #include <simobject.h>
 #include "debug.h"
+#include <iostream>
 
 namespace vortex {
 
@@ -78,6 +79,7 @@ enum class FUType {
   LSU,
   FPU,
   SFU,
+  TCU,
   Count
 };
 
@@ -87,6 +89,7 @@ inline std::ostream &operator<<(std::ostream &os, const FUType& type) {
   case FUType::LSU: os << "LSU"; break;
   case FUType::FPU: os << "FPU"; break;
   case FUType::SFU: os << "SFU"; break;
+  case FUType::TCU: os << "TCU"; break;
   default: assert(false);
   }
   return os;
@@ -118,14 +121,30 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
 
 enum class LsuType {
   LOAD,
+  TCU_LOAD,
   STORE,
+  TCU_STORE,
   FENCE
 };
 
+enum class TCUType {
+  TCU_MUL
+};
+
+inline std::ostream &operator<<(std::ostream &os, const TCUType& type) {
+  switch (type) {
+  case TCUType::TCU_MUL: os << "TCU MUL"; break;
+  default: assert(false);
+  }
+  return os;
+}
+
 inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
   switch (type) {
   case LsuType::LOAD:  os << "LOAD"; break;
+  case LsuType::TCU_LOAD: os << "TCU_LOAD"; break;
   case LsuType::STORE: os << "STORE"; break;
+  case LsuType::TCU_STORE: os << "TCU_STORE"; break;
   case LsuType::FENCE: os << "FENCE"; break;
   default: assert(false);
   }
@@ -383,7 +402,7 @@ class Mux : public SimObject<Mux<Type>> {
     , type_(type)
     , delay_(delay)
     , cursors_(num_outputs, 0)
-    , num_reqs_(num_inputs / num_outputs)
+    , num_reqs_(log2ceil(num_inputs / num_outputs))
   {
     assert(delay != 0);
     assert(num_inputs <= 32);
@@ -407,7 +426,7 @@ class Mux : public SimObject<Mux<Type>> {
   void tick() {
     uint32_t I = Inputs.size();
     uint32_t O = Outputs.size();
-    uint32_t R = num_reqs_;
+    uint32_t R = 1 << num_reqs_;
 
     // skip bypass mode
     if (I == O)
diff --git a/tests/regression/matmul/Makefile b/tests/regression/matmul/Makefile
new file mode 100644
index 000000000..7f1c48523
--- /dev/null
+++ b/tests/regression/matmul/Makefile
@@ -0,0 +1,14 @@
+ROOT_DIR := $(realpath ../../..)
+include $(ROOT_DIR)/config.mk
+
+PROJECT := matmul
+
+SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT)
+
+SRCS := $(SRC_DIR)/main.cpp
+
+VX_SRCS := $(SRC_DIR)/kernel.cpp
+
+OPTS ?= -n128 -d1
+
+include ../common.mk
diff --git a/tests/regression/matmul/common.h b/tests/regression/matmul/common.h
new file mode 100644
index 000000000..a9aa5de6c
--- /dev/null
+++ b/tests/regression/matmul/common.h
@@ -0,0 +1,17 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+typedef struct {
+  uint32_t num_tasks;
+  uint32_t num_warps;
+  uint32_t num_threads;
+  uint32_t TC_per_warp;
+  uint32_t matrix_size;
+  uint32_t data_size;
+  uint64_t tc_size;
+  uint64_t src0_addr;
+  uint64_t src1_addr;
+  uint64_t dst_addr;
+} kernel_arg_t;
+
+#endif
\ No newline at end of file
diff --git a/tests/regression/matmul/kernel.cpp b/tests/regression/matmul/kernel.cpp
new file mode 100644
index 000000000..eeb902acb
--- /dev/null
+++ b/tests/regression/matmul/kernel.cpp
@@ -0,0 +1,124 @@
+#include <stdint.h>
+#include <vx_intrinsics.h>
+#include <vx_spawn.h>
+#include "common.h"
+
+void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
+	uint32_t task_id = blockIdx.x;
+	int32_t* src0_ptr = (int32_t*)arg->src0_addr;
+	int32_t* src1_ptr = (int32_t*)arg->src1_addr;
+	int32_t* dst_ptr  = (int32_t*)arg->dst_addr;
+	unsigned a_addr = reinterpret_cast<unsigned>(src0_ptr);
+	unsigned b_addr = reinterpret_cast<unsigned>(src1_ptr);
+	unsigned c_addr = reinterpret_cast<unsigned>(dst_ptr);
+
+	uint32_t tc_size = arg->tc_size;
+	int TC_per_warp = arg->TC_per_warp;
+	unsigned num_threads = arg->num_threads;
+	int num_warps = arg->num_warps;
+	uint32_t matrix_size = arg->matrix_size;
+	
+	int n_tiles = matrix_size/tc_size;
+	int num_output_tiles = (matrix_size*matrix_size)/(tc_size*tc_size);
+	
+	int num_tasks = arg->num_tasks;
+
+	//Assuming matrix size always > tensor core size
+	int warps_actual;
+	if (TC_per_warp > num_output_tiles)
+		warps_actual = 1;
+	else 
+		warps_actual = num_output_tiles/TC_per_warp;
+
+	int num_warps_actual = (warps_actual < num_warps)? warps_actual: num_warps;
+	int num_threads_per_tc = (1> num_threads/TC_per_warp)? 1: num_threads/TC_per_warp;
+	
+	int num_tasks_per_thread = (1> (num_tasks/(num_threads*num_warps_actual)))? 1: (num_tasks/(num_threads*num_warps_actual));
+	int num_tasks_per_warp = (1 > num_tasks/num_warps_actual)? 1:num_tasks/num_warps_actual;
+	int task_id_first_warp = task_id%num_tasks_per_warp;
+
+	//A&B
+	int num_data_per_op_tile = tc_size*tc_size*n_tiles;
+	int num_data_per_warp = num_data_per_op_tile*((1> (num_output_tiles/num_warps_actual))?1:(num_output_tiles/num_warps_actual));
+	
+	int addr_shift;
+	if (((tc_size*tc_size*n_tiles)/(num_threads)) > 1)
+		addr_shift = (tc_size*tc_size*n_tiles)/(num_threads);
+	else
+		addr_shift = 1;
+	//Offset for 1st warp
+	int offset = ((task_id_first_warp/num_tasks_per_thread)*addr_shift) + ((task_id_first_warp%num_tasks_per_thread)*num_data_per_op_tile);
+	offset = offset + (num_data_per_warp*(task_id/num_tasks_per_warp));
+
+	//C
+	int num_data_per_op_tile_c = tc_size*tc_size;
+	int num_data_per_warp_c = num_data_per_warp/n_tiles;
+	
+	int addr_shift_c;
+	if (((tc_size*tc_size)/(num_threads)) > 1)
+		addr_shift_c = tc_size;
+	else
+		addr_shift_c = 1;
+	//Offset for 1st warp
+	int offset_c = ((task_id_first_warp/num_tasks_per_thread)*addr_shift_c) + ((task_id_first_warp%num_tasks_per_thread)*num_data_per_op_tile_c);
+	offset_c = offset_c + (num_data_per_warp_c*(task_id/num_tasks_per_warp));
+	
+	int thread_limit = (num_threads < tc_size*tc_size*n_tiles*TC_per_warp)? num_threads : tc_size*tc_size*n_tiles*TC_per_warp;
+	int thread_limit_c = (num_threads<tc_size*tc_size)? num_threads:tc_size*tc_size;
+	
+	//OLD TASK DISTRIBUTION // For 8x8 matrix, 2x2 tc_size, 1 tc_num, 4threads, 2warps => 64 tasks => 32 tasks/warp => 8 tasks/thread
+	/*task0->thread0, warp0
+	task1->thread0 , warp0
+	task2->thread0 , warp0
+	.
+	task7->thread0 
+	task8->thread1
+	task9->thread1 
+	.
+	.
+	------
+	task32 -> thread0, warp1
+	task33 -> thread1, warp1
+	.
+	*/
+
+	//NEW TASK DISTRIBUTION // For 8x8 matrix, 2x2 tc_size, 1 tc_num, 4threads, 2warps => 64 tasks => 32 tasks/warp => 8 tasks/thread
+	/*task0->thread0, warp0
+	task1->thread1 , warp0
+	task2->thread2 , warp0
+	task3->thread3 ,...
+	task4->thread0
+	task5->thread1 
+	.
+	.
+	------
+	task32 -> thread0, warp1
+	task33 -> thread1, warp1
+	.
+	.*/
+	
+	//TODO :: change this for new task->thread distribution
+	if (((task_id%num_tasks_per_warp)/num_tasks_per_thread) < thread_limit)
+	{	
+		unsigned a_addr_base = a_addr + offset*arg->data_size;
+		unsigned b_addr_base = b_addr + offset*arg->data_size;
+		unsigned c_addr_base = c_addr + offset_c*arg->data_size;
+		csr_write(VX_MAT_MUL_SIZE,n_tiles);
+		mload (0, a_addr_base);
+		mload (1, b_addr_base);
+		//In case of multiple threads - sync load
+		vx_fence();
+
+		mm();   //Assuming padding to ensure matrix size is a multiple of tc_size
+		vx_fence();
+		if (((task_id%num_tasks_per_warp)/num_tasks_per_thread) < thread_limit_c)
+			ms(c_addr_base);
+		//In case of multiple threads - sync store
+		vx_fence();
+	}	
+}
+
+int main() {
+	kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
+	return vx_spawn_threads(1, &arg->num_tasks, nullptr, (vx_kernel_func_cb)kernel_body, arg);
+}
diff --git a/tests/regression/matmul/main.cpp b/tests/regression/matmul/main.cpp
new file mode 100644
index 000000000..6a86712ae
--- /dev/null
+++ b/tests/regression/matmul/main.cpp
@@ -0,0 +1,343 @@
+#include <iostream>
+#include <unistd.h>
+#include <string.h>
+#include <vector>
+#include <chrono>
+#include <vortex.h>
+#include <cmath>
+#include "common.h"
+
+#define RT_CHECK(_expr)                                         \
+   do {                                                         \
+     int _ret = _expr;                                          \
+     if (0 == _ret)                                             \
+       break;                                                   \
+     printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);   \
+	 cleanup();			                                              \
+     exit(-1);                                                  \
+   } while (false)
+
+///////////////////////////////////////////////////////////////////////////////
+
+const char* kernel_file = "kernel.vxbin";
+uint32_t matrix_size = 0;
+vx_device_h device = nullptr;
+vx_buffer_h A_buffer = nullptr;
+vx_buffer_h B_buffer = nullptr;
+vx_buffer_h C_buffer = nullptr;
+vx_buffer_h krnl_buffer = nullptr;
+vx_buffer_h args_buffer = nullptr;
+
+std::vector<uint8_t> staging_buf;
+kernel_arg_t kernel_arg = {};
+
+static void show_usage() {
+   std::cout << "Vortex Test." << std::endl;
+   std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
+}
+
+static void parse_args(int argc, char **argv, uint32_t &data_size) {
+  int c;
+  while ((c = getopt(argc, argv, "n:k:d:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      matrix_size = atoi(optarg);
+      break;
+    case 'k':
+      kernel_file = optarg;
+      break;
+    case 'd':
+      data_size = atoi(optarg);
+      break;  
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+}
+
+void cleanup() {
+  if (device) {
+    vx_mem_free(A_buffer);
+    vx_mem_free(B_buffer);
+    vx_mem_free(C_buffer);
+    vx_mem_free(krnl_buffer);
+    vx_mem_free(args_buffer);
+    vx_dev_close(device);
+  }
+}
+
+template<typename TYPE>
+class mainVariables
+{
+  public:
+    // Constructor
+    mainVariables(uint32_t bufSize, uint32_t dataSize, uint32_t matrixSize)
+        : buf_size(bufSize), data_size(dataSize), matrix_size(matrixSize)
+    {
+        // Resize vectors to specified sizes
+        src_A.resize(buf_size/data_size);
+        src_B.resize(buf_size/data_size);
+        refs.resize(buf_size/data_size);
+    }
+
+  void init_inputs ()
+  {
+    std::cout << "inside init" << std::endl;
+    for (uint32_t i = 0; i < matrix_size*matrix_size; ++i) 
+    {
+      auto a = static_cast<float>(std::rand()) / RAND_MAX;
+      auto b = static_cast<float>(std::rand()) / RAND_MAX;
+      src_A[i] = static_cast<TYPE>(a * matrix_size);
+      src_B[i] = static_cast<TYPE>(b * matrix_size);
+    }
+  }
+
+  void matmul_cpu() 
+  {
+    for (uint32_t row = 0; row < matrix_size; ++row) 
+    {
+      for (uint32_t col = 0; col < matrix_size; ++col) 
+      {
+        TYPE sum(0);
+        for (uint32_t e = 0; e < matrix_size; ++e) {
+            sum += src_A[row * matrix_size + e] * src_B[e * matrix_size + col];
+        }
+        refs[row * matrix_size + col] = sum;
+      }
+    }
+  }
+
+  //Public variables
+  std::vector<TYPE> src_A;
+  std::vector<TYPE> src_B;
+  std::vector<TYPE> refs;
+
+  std::vector<uint8_t> A_mat;
+  std::vector<uint8_t> B_mat;
+
+  private:
+    uint32_t buf_size;
+    uint32_t data_size;
+    uint32_t matrix_size;
+};
+
+
+
+int main(int argc, char *argv[]) {  
+  // parse command arguments
+  uint32_t data_size = 0;
+  parse_args(argc, argv, data_size);
+  if (matrix_size == 0) {
+    matrix_size = 2;
+  }
+
+  // open device connection
+  std::cout << "open device connection" << std::endl;  
+  RT_CHECK(vx_dev_open(&device));
+
+  uint64_t num_cores, num_warps, num_threads, tc_size, TC_per_warp;
+  RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
+  RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
+  RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
+  RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size));
+  RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp));
+
+  std::cout << "Debug :: tc_size = " << tc_size << std::endl;
+  std::cout << "Debug :: tc_num = " << TC_per_warp << std::endl;
+
+  int threads_per_tc;
+  //TODO - can be changed
+  //Number of output tiles * number of threads
+  if (TC_per_warp > num_threads)
+    threads_per_tc = 1;
+  else
+    threads_per_tc = num_threads/TC_per_warp;
+
+  uint32_t num_tasks  = ((matrix_size*matrix_size)/(tc_size*tc_size))*threads_per_tc;
+
+  //size of each operand
+  uint32_t buf_size   =  ((matrix_size*matrix_size)/(tc_size*tc_size))*(matrix_size/(tc_size))*(tc_size*tc_size)*data_size;
+
+  //256
+  std::cout << "Debug :: buf_size: " << buf_size << " bytes" << std::endl;
+
+  // allocate device memory
+  std::cout << "allocate device memory" << std::endl;
+
+  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &A_buffer));
+  RT_CHECK(vx_mem_address(A_buffer, &kernel_arg.src0_addr));
+  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &B_buffer));
+  RT_CHECK(vx_mem_address(B_buffer, &kernel_arg.src1_addr));
+  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &C_buffer));
+  RT_CHECK(vx_mem_address(C_buffer, &kernel_arg.dst_addr));
+
+  std::cout << "A_addr=0x" << std::hex << kernel_arg.src0_addr << std::endl;
+  std::cout << "B_addr=0x" << std::hex << kernel_arg.src1_addr << std::endl;
+  std::cout << "C_addr=0x" << std::hex << kernel_arg.dst_addr << std::endl;
+
+  mainVariables<int> variables (buf_size, data_size, matrix_size);
+  variables.init_inputs();
+  
+  //////////////////////////////////////////////////
+  // generate source data
+  //////////////////////////////////////////////////
+  variables.matmul_cpu();
+
+  uint32_t tc_size_f = tc_size*tc_size;
+  uint32_t n_tiles = matrix_size/tc_size;
+  
+  variables.A_mat.resize(buf_size);
+  variables.B_mat.resize(buf_size);
+
+  //Demand matrix creation for A / traverse through the rows
+  for(uint32_t k=0; k<n_tiles; k++)
+  {
+    //traverse through output tiles in a row
+    for(uint32_t i=0; i<n_tiles; i++)
+    {
+      //traverse through tiles for one output tile
+        for(uint32_t j=0; j< n_tiles; j++)
+        {
+          for(int t=0; t < tc_size*tc_size; t++)
+          { 
+            variables.A_mat[n_tiles*n_tiles*tc_size_f*k + n_tiles*tc_size_f*i+tc_size_f*j + t]   = variables.src_A[k*tc_size*matrix_size+ tc_size*j +(t/tc_size)*matrix_size + t%tc_size];
+          }
+        }
+    }
+  }
+  
+  //Demand matrix creation for B / traverse through the rows
+  for(uint32_t k=0; k<n_tiles; k++)
+  {
+    //traverse through output tiles in a row
+    for(uint32_t i=0; i<n_tiles; i++)
+    {
+      //traverse through tiles for one output tile
+      for(uint32_t j=0; j< n_tiles; j++)
+      {
+        for(int t=0; t < tc_size*tc_size; t++)
+        {
+          variables.B_mat[n_tiles*n_tiles*tc_size_f*k + n_tiles*tc_size_f*i+tc_size_f*j + t]   = variables.src_B[i*tc_size+ tc_size*matrix_size*j +(t/tc_size)*matrix_size + t%tc_size];
+        }
+      }
+    }
+  }
+  
+  //////////////////////////////////////////////////
+  //////////////////////////////////////////////////
+
+  // upload matrix A buffer
+  {
+    std::cout << "upload matrix A buffer" << std::endl;
+    RT_CHECK(vx_copy_to_dev(A_buffer, (int8_t*)variables.A_mat.data(), 0, buf_size));
+  }
+
+  // upload matrix B buffer
+  {
+    std::cout << "upload matrix B buffer" << std::endl;
+    RT_CHECK(vx_copy_to_dev(B_buffer, (int8_t*)variables.B_mat.data(), 0, buf_size));
+  }
+
+  // upload program
+  std::cout << "upload program" << std::endl;
+  RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
+
+  //////////////////////////////////////////////////
+  //Prep kernel arguments
+  //////////////////////////////////////////////////  
+  //1
+  std::cout << "Debug :: num_tasks = " << num_tasks << std::endl;
+  kernel_arg.num_tasks = num_tasks;
+  kernel_arg.num_warps = num_warps;
+  kernel_arg.num_threads = num_threads;
+  kernel_arg.TC_per_warp = TC_per_warp;
+  //1
+  kernel_arg.matrix_size = matrix_size;
+  kernel_arg.data_size = data_size;
+  kernel_arg.tc_size = tc_size;
+
+  std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
+  std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
+  std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
+
+  //////////////////////////////////////////////////  
+  //////////////////////////////////////////////////
+  
+  // upload kernel argument
+  std::cout << "upload kernel argument" << std::endl;
+  RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
+
+  auto time_start = std::chrono::high_resolution_clock::now();
+
+    std::cout << "start device" << std::endl;
+    RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
+
+    // wait for completion
+    std::cout << "wait for completion" << std::endl;
+    RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
+
+  auto time_end = std::chrono::high_resolution_clock::now();
+  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  // download destination buffer
+  std::cout << "download destination buffer" << std::endl;
+  RT_CHECK(vx_copy_from_dev((int8_t*)variables.B_mat.data(), C_buffer, 0, buf_size));
+
+  // verify result (TODO : needs to be fixed for for functional correctness)
+  /*
+  std::cout << "verify result" << std::endl;  
+  {
+    int errors = 0;
+    auto buf_ptr = (int8_t*)staging_buf.data();
+    uint64_t tc_size = kernel_arg.tc_size;
+    std::cout << "tc_size = " << tc_size << std::endl;
+    int Result[matrix_size*matrix_size];
+    int n_tiles = (matrix_size/tc_size);
+    int tc_size_f = tc_size*tc_size;
+
+    //converting buf ptr (tile by tile) to CPU style linear (row by row)
+    for(int k = 0; k < matrix_size/tc_size; k+= 1)
+    {
+      for(int j = 0; j < matrix_size; j+= tc_size)
+      {
+        for(int i =0; i < tc_size*tc_size; i++)
+        {
+          Result[ tc_size*matrix_size*k +j+ (i/tc_size)*matrix_size +i%(tc_size)]  = buf_ptr[matrix_size*tc_size*k+tc_size*j+i];
+        }
+      }    
+    }
+
+    for (uint32_t i = 0; i < matrix_size*matrix_size; ++i) {
+      //int ref = i + i; 
+      int cur = Result[i];
+      if (cur != refs[i]) {
+        ++errors;
+      }
+    }
+    if (errors != 0) {
+      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
+      std::cout << "FAILED!" << std::endl;
+      return 1;  
+    }
+    else
+    {
+      std::cout << "CONDITIONALLY PASSED!" << std::endl;
+    }
+  }
+  */
+
+  // cleanup
+  std::cout << "cleanup" << std::endl;  
+  cleanup();
+
+  std::cout << "PASSED!" << std::endl;
+
+  return 0;
+}
\ No newline at end of file

From 0e3badf723c9e2d03013237433aa0838b21c7f99 Mon Sep 17 00:00:00 2001
From: Varsha Singhania <vsinghania8@gatech.edu>
Date: Tue, 18 Jun 2024 02:19:57 -0400
Subject: [PATCH 2/7] Script checkin and code cleanup

---
 run_final.sh         | 22 ++++++++++++++++++++++
 sim/simx/execute.cpp | 28 +++++++++-------------------
 2 files changed, 31 insertions(+), 19 deletions(-)
 create mode 100755 run_final.sh

diff --git a/run_final.sh b/run_final.sh
new file mode 100755
index 000000000..5f618dc64
--- /dev/null
+++ b/run_final.sh
@@ -0,0 +1,22 @@
+# Define arrays for threads, warps, and matrix sizes
+matrix_sizes=(16 32 64 128 256 512)
+tcsizes=(8 16 32)
+tcnums=(4 8 16 32)
+#lsulanes=(4 16)
+#cores=(32)
+
+
+# Loop through each combination of threads and warps
+for size in "${matrix_sizes[@]}"; do
+    sed -i "s/OPTS ?= -n[0-9]\+/OPTS ?= -n${size}/" ../tests/regression/matmul/Makefile
+    sed -i "s/OPTS ?= -n[0-9]\+/OPTS ?= -n${size}/" tests/regression/matmul/Makefile
+    echo "Matrix size changed to ${size} in Makefile"
+    for tcsize in "${tcsizes[@]}"; do
+        for tcnum in "${tcnums[@]}"; do
+            log_name="sim_final/mat${size}/tcsize${tcsize}_tcnum${tcnum}_32w32t"
+            command="./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --tc_size=${tcsize} --tc_num=${tcnum} --rebuild=1 --perf=1  > ${log_name} 2>&1"
+            echo "$command"
+            eval "$command"
+        done
+    done    
+done
diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp
index d522145db..e13df18b9 100644
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
@@ -1432,8 +1432,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
     uint32_t data_bytes_store;
     uint32_t num_threads_per_tc = MAX (1, num_threads/TC_per_warp);
 
-    //int num_warps = MIN()
-    //int active_tcs =  MIN (TC_per_warp, num_output_tiles/num_warps)
     //LOAD
     if(num_threads > tc_size*tc_size*n_tiles*TC_per_warp)
     { 
@@ -1448,11 +1446,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
     data_bytes_load = mem_bytes*num_data_per_thread;
 
     //STORE
-    
-    // DP(3, "DEBUG :: num_threads = " << num_threads);
-    // DP(3, "DEBUG :: tc_size*tc_size = " << tc_size*tc_size);
-    //DP(3, "imm = " << immsrc);
-    
     if(num_threads > tc_size*tc_size*TC_per_warp)
     { 
       num_threads_actv_st = tc_size*tc_size*TC_per_warp;
@@ -1499,8 +1492,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
               scratchpad[loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n] = *temp_ref;
               DP(3, "Scratchpad Index: " << loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n << ", Value: " << scratchpad[loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n]);
             }
-            //loop_offset += tc_size*tc_size;
-          //}
         }
         rd_write = true;  
       } break;
@@ -1531,7 +1522,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
             uint32_t csr_index = (2*num_data_per_thread_st) + n;
             uint32_t scratchpad_index = (tc_size*tc_size*2) + (t*num_data_per_thread) + n;
             
-            //scratchpad -> csr (TODO :: can intermediate step of moving to CSR be skipped?)
+            //scratchpad -> csr (TODO :: removed intermediate CSR stage ; incorporate limited scratchmad implementation)
             //core_->set_csr(csr_addr[(2*num_data_per_thread) + n], scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread) + n], t, warp_id_);
             Word* temp_ref = &(warp.ireg_file.at(t).at(rsrc0));
             *temp_ref = scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread_st) + n];
@@ -1562,14 +1553,14 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
           //TC operation [only 1 thread in 1 warp needs to do this]
           if (t%threads_per_tc == 0)
           {
-            //TODO - change to systolic array implementation
+            //TODO : change to systolic array implementation
             uint32_t thread_offset = t*(tc_size*tc_size);
             int loop_offset = 0;
             int offset_b = n_tiles*n_tiles*n_tiles*tc_size*tc_size;
-            // Loop over all tiles - output stationary
-            //for(int tiles = 0 ; tiles < n_tiles ; tiles++)  //What's the HW implication of this?? A counter implementation?
-            //{ 
-              /*
+            /*
+            // TODO : Fix needed for functional correctness
+            for(int tiles = 0 ; tiles < n_tiles ; tiles++)  //What's the HW implication of this?? A counter implementation?
+            { 
               for (int i = 0; i < tc_size; i++) { //ROW-1
                 for (int j = 0; j < tc_size; j++) { //COL-2
                   int sum = 0;
@@ -1579,12 +1570,11 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
                   }
                   scratchpad[accu_offset + thread_offset +(i * tc_size + j)] += sum; //[i * col2 + j] = sum
                   DP(3, "Scratchpad Index: " << accu_offset + (i * tc_size + j) << " , Value=" << scratchpad[accu_offset + (i * tc_size + j)]);
-
                 }
               }
-              */
-              //loop_offset += tc_size*tc_size; //Move to the next tiled matmul fragment
-            //}
+              loop_offset += tc_size*tc_size; //Move to the next tiled matmul fragment
+            }
+            */
           }
         }
 

From a378aed67cc7d891201124eb5b51059ec3989252 Mon Sep 17 00:00:00 2001
From: Nayan Sivakumar Nair <nayan.nair98@gmail.com>
Date: Fri, 21 Jun 2024 22:23:24 -0400
Subject: [PATCH 3/7] Moved tc_num, tc_size param to makefile args

---
 ci/blackbox.sh                     | 10 +---------
 ci/regression.sh.in                |  1 +
 hw/rtl/VX_types.vh                 |  3 +++
 runtime/simx/vortex.cpp            | 14 +++++++-------
 sim/simx/arch.h                    | 14 +-------------
 sim/simx/emulator.cpp              | 16 +++++++++++++++-
 sim/simx/emulator.h                |  3 +++
 sim/simx/execute.cpp               |  7 +++++--
 sim/simx/func_unit.cpp             |  4 +++-
 sim/simx/func_unit.h               |  2 +-
 sim/simx/main.cpp                  |  2 +-
 tests/regression/matmul/Makefile   |  2 +-
 tests/regression/matmul/kernel.cpp |  5 ++++-
 tests/regression/matmul/main.cpp   | 29 ++++++++++++++++++++++++-----
 14 files changed, 70 insertions(+), 42 deletions(-)

diff --git a/ci/blackbox.sh b/ci/blackbox.sh
index 8a04133f9..defad4c05 100755
--- a/ci/blackbox.sh
+++ b/ci/blackbox.sh
@@ -114,14 +114,6 @@ case $i in
         LOGFILE=${i#*=}
         shift
         ;;
-    --tc_size=*)
-        TC_SIZE=${i#*=}
-        shift
-        ;;
-    --tc_num=*)
-        TC_NUM=${i#*=}
-        shift
-        ;;
     --help)
         show_help
         exit 0
@@ -190,7 +182,7 @@ then
 fi
 
 CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS"
-CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DTC_NUM=$TC_NUM -DTC_SIZE=$TC_SIZE $L2 $L3 $PERF_FLAG $CONFIGS"
+# CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DTC_NUM=$TC_NUM -DTC_SIZE=$TC_SIZE $L2 $L3 $PERF_FLAG $CONFIGS"
 echo "CONFIGS=$CONFIGS"
 
 if [ $REBUILD -ne 0 ]
diff --git a/ci/regression.sh.in b/ci/regression.sh.in
index a5f1bffdb..50d309af6 100755
--- a/ci/regression.sh.in
+++ b/ci/regression.sh.in
@@ -124,6 +124,7 @@ regression()
     # test local barrier
     ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tbar"
     ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -tbar"
+    
 
     echo "regression tests done!"
 }
diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh
index 06929b058..9a8f93234 100644
--- a/hw/rtl/VX_types.vh
+++ b/hw/rtl/VX_types.vh
@@ -197,6 +197,9 @@
 `define VX_CSR_LOCAL_MEM_BASE           12'hFC3
 
 `define VX_MAT_MUL_SIZE                 12'hFC4
+`define VX_TC_NUM                       12'hFC5
+`define VX_TC_SIZE                      12'hFC6
+
 
 
 `endif // VX_TYPES_VH
diff --git a/runtime/simx/vortex.cpp b/runtime/simx/vortex.cpp
index f65d7b385..4210ab0b6 100644
--- a/runtime/simx/vortex.cpp
+++ b/runtime/simx/vortex.cpp
@@ -32,7 +32,7 @@ using namespace vortex;
 class vx_device {
 public:
   vx_device()
-    : arch_(NUM_THREADS, NUM_WARPS, NUM_CORES, TC_SIZE, TC_NUM)
+    : arch_(NUM_THREADS, NUM_WARPS, NUM_CORES)
     , ram_(0, RAM_PAGE_SIZE)
     , processor_(arch_)
     , global_mem_(ALLOC_BASE_ADDR,
@@ -69,12 +69,12 @@ class vx_device {
     case VX_CAPS_NUM_CORES:
       _value = NUM_CORES * NUM_CLUSTERS;
       break;
-    case VX_CAPS_TC_SIZE:
-      _value = TC_SIZE;
-      break;
-    case VX_CAPS_TC_NUM:
-      _value = TC_NUM;
-      break;
+    // case VX_CAPS_TC_SIZE:
+    //   _value = TC_SIZE;
+    //   break;
+    // case VX_CAPS_TC_NUM:
+    //   _value = TC_NUM;
+    //   break;
     case VX_CAPS_CACHE_LINE_SIZE:
       _value = CACHE_BLOCK_SIZE;
       break;
diff --git a/sim/simx/arch.h b/sim/simx/arch.h
index e35687dbd..9af266d7a 100644
--- a/sim/simx/arch.h
+++ b/sim/simx/arch.h
@@ -35,11 +35,9 @@ class Arch {
   uint16_t num_barriers_;
   uint16_t ipdom_size_;
   uint64_t local_mem_base_;
-  uint16_t tc_size_;
-  uint16_t tc_num_;
 
 public:
-  Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint64_t tc_size, uint64_t tc_num)   
+  Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores)   
     : num_threads_(num_threads)
     , num_warps_(num_warps)
     , num_cores_(num_cores)
@@ -51,8 +49,6 @@ class Arch {
     , num_barriers_(NUM_BARRIERS)
     , ipdom_size_((num_threads-1) * 2)
     , local_mem_base_(LMEM_BASE_ADDR)
-    , tc_size_ (tc_size)
-    , tc_num_ (tc_num)
   {}
 
   uint16_t vsize() const {
@@ -98,14 +94,6 @@ class Arch {
   uint16_t socket_size() const {
     return socket_size_;
   }
-  
-  uint16_t tc_size() const {
-    return tc_size_;
-  }
-
-  uint16_t tc_num() const {
-    return tc_num_;
-  }
 
 };
 
diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp
index ea5f72c42..d2faf7f98 100644
--- a/sim/simx/emulator.cpp
+++ b/sim/simx/emulator.cpp
@@ -74,7 +74,7 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
     , core_(core)
     , warps_(arch.num_warps(), arch)
     , barriers_(arch.num_barriers(), 0)
-    , scratchpad(std::vector<Word>(core->arch().tc_size() * core->arch().tc_size() * 32768)) //Fix this
+    , scratchpad(std::vector<Word>(32 * 32 * 32768)) //Fix this : Max TC_SIZE = 32
 {
   this->clear();
 }
@@ -355,6 +355,11 @@ Word Emulator::get_tiles()
   return mat_size;
 }
 
+Word Emulator::get_tc_size()
+{
+  return tc_size;
+}
+
 Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
   auto core_perf = core_->perf_stats();
   switch (addr) {
@@ -387,6 +392,8 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
   case VX_CSR_LOCAL_MEM_BASE: return arch_.local_mem_base();
   case VX_CSR_MSCRATCH:   return csr_mscratch_;
   case VX_MAT_MUL_SIZE:   return mat_size;
+  case VX_TC_NUM:         return tc_num;
+  case VX_TC_SIZE:        return tc_size;
 
   CSR_READ_64(VX_CSR_MCYCLE, core_perf.cycles);
   CSR_READ_64(VX_CSR_MINSTRET, core_perf.instrs);
@@ -500,6 +507,13 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
   case VX_MAT_MUL_SIZE:
     mat_size = value;
     break;
+  case VX_TC_NUM:
+    tc_num = value;
+    break;
+  case VX_TC_SIZE:
+    tc_size = value;
+    break;
+  
   default: {
       std::cout << std::hex << "Error: invalid CSR write addr=0x" << addr << ", value=0x" << value << std::endl;
       std::abort();
diff --git a/sim/simx/emulator.h b/sim/simx/emulator.h
index 82b5bc98b..743c2786e 100644
--- a/sim/simx/emulator.h
+++ b/sim/simx/emulator.h
@@ -55,6 +55,7 @@ class Emulator {
   int get_exitcode() const;
   
   Word get_tiles();
+  Word get_tc_size();
 
 private:
 
@@ -125,6 +126,8 @@ class Emulator {
   wspawn_t    wspawn_;
   std::vector<Word> scratchpad;
   uint32_t mat_size;
+  uint32_t tc_size;
+  uint32_t tc_num;
 };
 
 }
diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp
index e13df18b9..0dfd72a0f 100644
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
@@ -1419,8 +1419,11 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
   { //TODO - make it data-type flexible
     uint32_t mem_bytes = 1;
     DP(3, "mem_bytes=" << mem_bytes << std::endl);
-    uint16_t tc_size = core_->arch().tc_size();
-    uint32_t TC_per_warp = core_->arch().tc_num();
+    uint16_t tc_size = this->get_csr(VX_TC_SIZE, 0, wid);
+    uint32_t TC_per_warp = this->get_csr(VX_TC_NUM, 0, wid);
+
+    DP(3, "tc_size=" << tc_size << std::endl);
+    DP(3, "TC_per_warp=" << TC_per_warp << std::endl);
 
     //Number of loads - dependant on the thread config
     uint32_t n_tiles = this->get_csr(VX_MAT_MUL_SIZE, 0, wid);  //CSR instruction before MLOAD will ensure that this csr has value
diff --git a/sim/simx/func_unit.cpp b/sim/simx/func_unit.cpp
index 3991a17e7..f53a1fb22 100644
--- a/sim/simx/func_unit.cpp
+++ b/sim/simx/func_unit.cpp
@@ -255,7 +255,7 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
 
 TcuUnit::TcuUnit(const SimContext& ctx, Core* core) 
     : FuncUnit(ctx, core, "TCU")
-    , tc_size (core_->arch().tc_size())
+    // , tc_size (core_->arch().tc_size())
     {}
 
 void TcuUnit::tick() {
@@ -267,6 +267,8 @@ void TcuUnit::tick() {
         auto& output = Outputs.at(i);
         auto trace = input.front();
         uint32_t n_tiles = core_->emulator_.get_tiles();
+		uint32_t tc_size = core_->emulator_.get_tc_size();
+
         switch (trace->tcu_type) {
             case TCUType::TCU_MUL:
             {    //mat size = n_tiles * tc_size
diff --git a/sim/simx/func_unit.h b/sim/simx/func_unit.h
index 5fc922991..a7f182efe 100644
--- a/sim/simx/func_unit.h
+++ b/sim/simx/func_unit.h
@@ -103,7 +103,7 @@ class LsuUnit : public FuncUnit {
 class TcuUnit : public FuncUnit {
 public:
     TcuUnit(const SimContext& ctx, Core*);
-    uint64_t tc_size;
+    // uint64_t tc_size;
     void tick();
 };
 
diff --git a/sim/simx/main.cpp b/sim/simx/main.cpp
index 58eb96d61..9031a0a02 100644
--- a/sim/simx/main.cpp
+++ b/sim/simx/main.cpp
@@ -83,7 +83,7 @@ int main(int argc, char **argv) {
 
   {
     // create processor configuation
-    Arch arch(num_threads, num_warps, num_cores, tc_size, tc_num);
+    Arch arch(num_threads, num_warps, num_cores);
 
     // create memory module
     RAM ram(0, RAM_PAGE_SIZE);
diff --git a/tests/regression/matmul/Makefile b/tests/regression/matmul/Makefile
index 7f1c48523..0ef207194 100644
--- a/tests/regression/matmul/Makefile
+++ b/tests/regression/matmul/Makefile
@@ -9,6 +9,6 @@ SRCS := $(SRC_DIR)/main.cpp
 
 VX_SRCS := $(SRC_DIR)/kernel.cpp
 
-OPTS ?= -n128 -d1
+OPTS ?= -n512 -d1 -s4 -t4
 
 include ../common.mk
diff --git a/tests/regression/matmul/kernel.cpp b/tests/regression/matmul/kernel.cpp
index eeb902acb..a4585fb53 100644
--- a/tests/regression/matmul/kernel.cpp
+++ b/tests/regression/matmul/kernel.cpp
@@ -13,7 +13,7 @@ void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
 	unsigned c_addr = reinterpret_cast<unsigned>(dst_ptr);
 
 	uint32_t tc_size = arg->tc_size;
-	int TC_per_warp = arg->TC_per_warp;
+	uint32_t TC_per_warp = arg->TC_per_warp;
 	unsigned num_threads = arg->num_threads;
 	int num_warps = arg->num_warps;
 	uint32_t matrix_size = arg->matrix_size;
@@ -104,6 +104,9 @@ void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
 		unsigned b_addr_base = b_addr + offset*arg->data_size;
 		unsigned c_addr_base = c_addr + offset_c*arg->data_size;
 		csr_write(VX_MAT_MUL_SIZE,n_tiles);
+		csr_write(VX_TC_NUM,TC_per_warp);
+		csr_write(VX_TC_SIZE,tc_size);
+
 		mload (0, a_addr_base);
 		mload (1, b_addr_base);
 		//In case of multiple threads - sync load
diff --git a/tests/regression/matmul/main.cpp b/tests/regression/matmul/main.cpp
index 6a86712ae..b2238bf5a 100644
--- a/tests/regression/matmul/main.cpp
+++ b/tests/regression/matmul/main.cpp
@@ -21,6 +21,9 @@
 
 const char* kernel_file = "kernel.vxbin";
 uint32_t matrix_size = 0;
+uint32_t tc_num = 4;
+uint32_t TC_size = 8;
+
 vx_device_h device = nullptr;
 vx_buffer_h A_buffer = nullptr;
 vx_buffer_h B_buffer = nullptr;
@@ -38,7 +41,7 @@ static void show_usage() {
 
 static void parse_args(int argc, char **argv, uint32_t &data_size) {
   int c;
-  while ((c = getopt(argc, argv, "n:k:d:h?")) != -1) {
+  while ((c = getopt(argc, argv, "n:k:d:t:s:h?")) != -1) {
     switch (c) {
     case 'n':
       matrix_size = atoi(optarg);
@@ -48,7 +51,13 @@ static void parse_args(int argc, char **argv, uint32_t &data_size) {
       break;
     case 'd':
       data_size = atoi(optarg);
-      break;  
+      break; 
+    case 't':
+      tc_num = atoi(optarg);
+      break;
+    case 's':
+      TC_size = atoi(optarg);
+      break; 
     case 'h':
     case '?': {
       show_usage();
@@ -141,12 +150,22 @@ int main(int argc, char *argv[]) {
   std::cout << "open device connection" << std::endl;  
   RT_CHECK(vx_dev_open(&device));
 
-  uint64_t num_cores, num_warps, num_threads, tc_size, TC_per_warp;
+  uint64_t num_cores, num_warps, num_threads;
+  uint32_t tc_size, TC_per_warp;
+
   RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
   RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
   RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
-  RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size));
-  RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp));
+  
+  std::cout << "Debug :: tc_size (optarg) = " << TC_size << std::endl;
+  std::cout << "Debug :: tc_num (optarg) = " << tc_num << std::endl;
+
+  //Add assert/knob
+  tc_size = TC_size;
+  TC_per_warp = tc_num;
+
+  // RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size));
+  // RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp));
 
   std::cout << "Debug :: tc_size = " << tc_size << std::endl;
   std::cout << "Debug :: tc_num = " << TC_per_warp << std::endl;

From 5b0fc8cbd43c813ced9790b287e8444ab619606b Mon Sep 17 00:00:00 2001
From: Nayan Sivakumar Nair <nayan.nair98@gmail.com>
Date: Tue, 25 Jun 2024 03:18:50 -0400
Subject: [PATCH 4/7] Fixes for PR

---
 ci/blackbox.sh                               |  3 ---
 ci/regression.sh.in                          |  4 ++-
 hw/rtl/VX_config.vh                          |  8 +++---
 hw/rtl/VX_types.vh                           |  2 +-
 kernel/include/vx_intrinsics.h               | 12 ++++-----
 run_final.sh                                 | 22 -----------------
 runtime/simx/vortex.cpp                      | 12 ++++-----
 sim/simx/decode.cpp                          |  6 ++---
 sim/simx/emulator.cpp                        | 10 +++++++-
 sim/simx/emulator.h                          |  3 ++-
 sim/simx/execute.cpp                         | 22 ++++++-----------
 sim/simx/func_unit.cpp                       |  1 -
 sim/simx/func_unit.h                         |  1 -
 sim/simx/main.cpp                            |  2 --
 tests/regression/matmul/Makefile             |  2 +-
 tests/regression/matmul/kernel.cpp           |  8 +++---
 tests/regression/matmul/main.cpp             | 22 +++--------------
 tests/regression/matmul/matmul_regression.sh | 26 ++++++++++++++++++++
 18 files changed, 76 insertions(+), 90 deletions(-)
 delete mode 100755 run_final.sh
 create mode 100755 tests/regression/matmul/matmul_regression.sh

diff --git a/ci/blackbox.sh b/ci/blackbox.sh
index defad4c05..8bcb120f3 100755
--- a/ci/blackbox.sh
+++ b/ci/blackbox.sh
@@ -48,8 +48,6 @@ PERF_CLASS=0
 REBUILD=2
 TEMPBUILD=0
 LOGFILE=run.log
-TC_SIZE=567
-TC_NUM=123
 
 for i in "$@"
 do
@@ -182,7 +180,6 @@ then
 fi
 
 CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS"
-# CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DTC_NUM=$TC_NUM -DTC_SIZE=$TC_SIZE $L2 $L3 $PERF_FLAG $CONFIGS"
 echo "CONFIGS=$CONFIGS"
 
 if [ $REBUILD -ne 0 ]
diff --git a/ci/regression.sh.in b/ci/regression.sh.in
index 50d309af6..3c89ac996 100755
--- a/ci/regression.sh.in
+++ b/ci/regression.sh.in
@@ -124,7 +124,9 @@ regression()
     # test local barrier
     ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tbar"
     ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -tbar"
-    
+
+    # test for matmul
+    CONFIGS="-DTC_NUM=4 -DTC_SIZE=8" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args="-n128 -d1" 
 
     echo "regression tests done!"
 }
diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh
index 651234768..ef9306503 100644
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@@ -111,20 +111,20 @@
 `endif
 `define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)
 
+// Size of Tensor Core
 `ifndef TC_SIZE
-`define TC_SIZE 4
+`define TC_SIZE 8
 `endif
 
+// Number of TCs per Warp
 `ifndef TC_NUM
-`define TC_NUM 1
+`define TC_NUM 4
 `endif
 
-// Number of TCU units
 `ifndef NUM_TCU_LANES
 `define NUM_TCU_LANES   `TC_NUM
 `endif
 
-// Number of TCU units
 `ifndef NUM_TCU_BLOCKS
 `define NUM_TCU_BLOCKS  `ISSUE_WIDTH
 `endif
diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh
index 9a8f93234..23fb16904 100644
--- a/hw/rtl/VX_types.vh
+++ b/hw/rtl/VX_types.vh
@@ -196,7 +196,7 @@
 `define VX_CSR_NUM_CORES                12'hFC2
 `define VX_CSR_LOCAL_MEM_BASE           12'hFC3
 
-`define VX_MAT_MUL_SIZE                 12'hFC4
+`define VX_MAT_MUL_SIZE                 12'hFC4     // VX_MAT_MUL_SIZE = Matrix Size / TC Size
 `define VX_TC_NUM                       12'hFC5
 `define VX_TC_SIZE                      12'hFC6
 
diff --git a/kernel/include/vx_intrinsics.h b/kernel/include/vx_intrinsics.h
index b67a770da..5d16d44da 100644
--- a/kernel/include/vx_intrinsics.h
+++ b/kernel/include/vx_intrinsics.h
@@ -222,21 +222,19 @@ inline void vx_fence() {
 }
 
 //Matrix load
-//Converted instruction type cause destination registers were not getiing blocked otherwise
-inline void mload(unsigned dest, unsigned  addr) 
+inline void vx_matrix_load(unsigned dest, unsigned  addr) 
 {
     asm volatile (".insn i 0x7b, 0, x0, %0(%1)" :: "i"(dest), "r"(addr));
 }
 
-//mat store
-inline void ms(unsigned  addr) 
+//Matrix Store
+inline void vx_matrix_store(unsigned  addr) 
 {
     asm volatile (".insn i 0x7b, 1, x0, 0(%0)" :: "r"(addr));
 }
 
-//mat mul
-//num tiles along reduced K dimension of matmul as imm value (can use rd,rs field to expand range of n_tiles from 12 bits)
-inline void mm() 
+//Matrix Mul
+inline void vx_matrix_mul() 
 {
     asm volatile (".insn i 0x7b, 2, x0, 0(x0)");
 }
diff --git a/run_final.sh b/run_final.sh
deleted file mode 100755
index 5f618dc64..000000000
--- a/run_final.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-# Define arrays for threads, warps, and matrix sizes
-matrix_sizes=(16 32 64 128 256 512)
-tcsizes=(8 16 32)
-tcnums=(4 8 16 32)
-#lsulanes=(4 16)
-#cores=(32)
-
-
-# Loop through each combination of threads and warps
-for size in "${matrix_sizes[@]}"; do
-    sed -i "s/OPTS ?= -n[0-9]\+/OPTS ?= -n${size}/" ../tests/regression/matmul/Makefile
-    sed -i "s/OPTS ?= -n[0-9]\+/OPTS ?= -n${size}/" tests/regression/matmul/Makefile
-    echo "Matrix size changed to ${size} in Makefile"
-    for tcsize in "${tcsizes[@]}"; do
-        for tcnum in "${tcnums[@]}"; do
-            log_name="sim_final/mat${size}/tcsize${tcsize}_tcnum${tcnum}_32w32t"
-            command="./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --tc_size=${tcsize} --tc_num=${tcnum} --rebuild=1 --perf=1  > ${log_name} 2>&1"
-            echo "$command"
-            eval "$command"
-        done
-    done    
-done
diff --git a/runtime/simx/vortex.cpp b/runtime/simx/vortex.cpp
index 4210ab0b6..5ab5e14f5 100644
--- a/runtime/simx/vortex.cpp
+++ b/runtime/simx/vortex.cpp
@@ -69,12 +69,12 @@ class vx_device {
     case VX_CAPS_NUM_CORES:
       _value = NUM_CORES * NUM_CLUSTERS;
       break;
-    // case VX_CAPS_TC_SIZE:
-    //   _value = TC_SIZE;
-    //   break;
-    // case VX_CAPS_TC_NUM:
-    //   _value = TC_NUM;
-    //   break;
+    case VX_CAPS_TC_SIZE:
+      _value = TC_SIZE;
+      break;
+    case VX_CAPS_TC_NUM:
+      _value = TC_NUM;
+      break;
     case VX_CAPS_CACHE_LINE_SIZE:
       _value = CACHE_BLOCK_SIZE;
       break;
diff --git a/sim/simx/decode.cpp b/sim/simx/decode.cpp
index 4d8d0a105..21d0e61dd 100644
--- a/sim/simx/decode.cpp
+++ b/sim/simx/decode.cpp
@@ -410,9 +410,9 @@ static const char* op_string(const Instr &instr) {
   case Opcode::TCU:
     switch(func3)
     {
-      case 0: return "ML"; //
-      case 1: return "MS"; //
-      case 2: return "MATMUL";
+      case 0: return "ML";     // Matrix Load
+      case 1: return "MS";     // Matrix Store
+      case 2: return "MATMUL"; // Matrix Multiply
       default:
         std::abort();
     }
diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp
index d2faf7f98..0dc8a06c4 100644
--- a/sim/simx/emulator.cpp
+++ b/sim/simx/emulator.cpp
@@ -74,7 +74,10 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
     , core_(core)
     , warps_(arch.num_warps(), arch)
     , barriers_(arch.num_barriers(), 0)
-    , scratchpad(std::vector<Word>(32 * 32 * 32768)) //Fix this : Max TC_SIZE = 32
+    // Currently, tradeoff between scratchpad size & performance has not been evaluated. Scratchpad is
+    // considered to be big enough to hold input tiles for one output tile.
+    // In future versions, scratchpad size should be fixed to an appropriate value.
+    , scratchpad(std::vector<Word>(32 * 32 * 32768)) 
 {
   this->clear();
 }
@@ -360,6 +363,11 @@ Word Emulator::get_tc_size()
   return tc_size;
 }
 
+Word Emulator::get_tc_num()
+{
+  return tc_num;
+}
+
 Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
   auto core_perf = core_->perf_stats();
   switch (addr) {
diff --git a/sim/simx/emulator.h b/sim/simx/emulator.h
index 743c2786e..fe3aadf81 100644
--- a/sim/simx/emulator.h
+++ b/sim/simx/emulator.h
@@ -56,7 +56,8 @@ class Emulator {
   
   Word get_tiles();
   Word get_tc_size();
-
+  Word get_tc_num();
+  
 private:
 
   struct ipdom_entry_t {
diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp
index 0dfd72a0f..20025f40b 100644
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
@@ -1429,8 +1429,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
     uint32_t n_tiles = this->get_csr(VX_MAT_MUL_SIZE, 0, wid);  //CSR instruction before MLOAD will ensure that this csr has value
     int num_data_per_thread;
     int num_data_per_thread_st;
-    int num_threads_actv;
-    int num_threads_actv_st;
+    uint32_t num_threads_actv;
+    uint32_t num_threads_actv_st;
     uint32_t data_bytes_load;
     uint32_t data_bytes_store;
     uint32_t num_threads_per_tc = MAX (1, num_threads/TC_per_warp);
@@ -1506,7 +1506,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
 
         auto trace_data = std::make_shared<LsuTraceData>(num_threads);
         trace->data = trace_data;
-        uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2;
 
         for (uint32_t t = thread_start; t < num_threads_actv_st; ++t) 
         {
@@ -1521,12 +1520,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
           //Store C
           for (int n=0; n<num_data_per_thread_st; n++)
           {
-            uint64_t mem_addr = (base_addr+(n*mem_bytes));
-            uint32_t csr_index = (2*num_data_per_thread_st) + n;
-            uint32_t scratchpad_index = (tc_size*tc_size*2) + (t*num_data_per_thread) + n;
-            
-            //scratchpad -> csr (TODO :: removed intermediate CSR stage ; incorporate limited scratchmad implementation)
-            //core_->set_csr(csr_addr[(2*num_data_per_thread) + n], scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread) + n], t, warp_id_);
             Word* temp_ref = &(warp.ireg_file.at(t).at(rsrc0));
             *temp_ref = scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread_st) + n];
 
@@ -1534,7 +1527,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
           }
         }
         //Clear the scratchpad
-        for(int i =0 ; i < scratchpad.size(); i++)
+        for(long unsigned int i=0 ; i < scratchpad.size(); i++)
         {
           scratchpad[i] = 0;
         }
@@ -1545,7 +1538,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
         DP(4, "TCU MULTIPLY MAT");
         trace->fu_type = FUType::TCU;
         trace->tcu_type = TCUType::TCU_MUL;
-        uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2;
         uint32_t threads_per_tc = MAX (1, num_threads/TC_per_warp);
         for (uint32_t t = thread_start; t < num_threads_actv; ++t) 
         {
@@ -1556,12 +1548,14 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
           //TC operation [only 1 thread in 1 warp needs to do this]
           if (t%threads_per_tc == 0)
           {
-            //TODO : change to systolic array implementation
+            /*
+            // TODO : Fix needed for functional correctness
+            // TODO : change to systolic array implementation
             uint32_t thread_offset = t*(tc_size*tc_size);
+
             int loop_offset = 0;
             int offset_b = n_tiles*n_tiles*n_tiles*tc_size*tc_size;
-            /*
-            // TODO : Fix needed for functional correctness
+            uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2;
             for(int tiles = 0 ; tiles < n_tiles ; tiles++)  //What's the HW implication of this?? A counter implementation?
             { 
               for (int i = 0; i < tc_size; i++) { //ROW-1
diff --git a/sim/simx/func_unit.cpp b/sim/simx/func_unit.cpp
index f53a1fb22..8acbfddeb 100644
--- a/sim/simx/func_unit.cpp
+++ b/sim/simx/func_unit.cpp
@@ -255,7 +255,6 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
 
 TcuUnit::TcuUnit(const SimContext& ctx, Core* core) 
     : FuncUnit(ctx, core, "TCU")
-    // , tc_size (core_->arch().tc_size())
     {}
 
 void TcuUnit::tick() {
diff --git a/sim/simx/func_unit.h b/sim/simx/func_unit.h
index a7f182efe..cf119a5c3 100644
--- a/sim/simx/func_unit.h
+++ b/sim/simx/func_unit.h
@@ -103,7 +103,6 @@ class LsuUnit : public FuncUnit {
 class TcuUnit : public FuncUnit {
 public:
     TcuUnit(const SimContext& ctx, Core*);
-    // uint64_t tc_size;
     void tick();
 };
 
diff --git a/sim/simx/main.cpp b/sim/simx/main.cpp
index 9031a0a02..0f61de6f4 100644
--- a/sim/simx/main.cpp
+++ b/sim/simx/main.cpp
@@ -35,8 +35,6 @@ static void show_usage() {
 uint32_t num_threads = NUM_THREADS;
 uint32_t num_warps = NUM_WARPS;
 uint32_t num_cores = NUM_CORES;
-uint32_t tc_size = TC_SIZE;
-uint32_t  tc_num = TC_NUM;
 bool showStats = false;
 const char* program = nullptr;
 
diff --git a/tests/regression/matmul/Makefile b/tests/regression/matmul/Makefile
index 0ef207194..7f1c48523 100644
--- a/tests/regression/matmul/Makefile
+++ b/tests/regression/matmul/Makefile
@@ -9,6 +9,6 @@ SRCS := $(SRC_DIR)/main.cpp
 
 VX_SRCS := $(SRC_DIR)/kernel.cpp
 
-OPTS ?= -n512 -d1 -s4 -t4
+OPTS ?= -n128 -d1
 
 include ../common.mk
diff --git a/tests/regression/matmul/kernel.cpp b/tests/regression/matmul/kernel.cpp
index a4585fb53..b0b4753c7 100644
--- a/tests/regression/matmul/kernel.cpp
+++ b/tests/regression/matmul/kernel.cpp
@@ -107,15 +107,15 @@ void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
 		csr_write(VX_TC_NUM,TC_per_warp);
 		csr_write(VX_TC_SIZE,tc_size);
 
-		mload (0, a_addr_base);
-		mload (1, b_addr_base);
+		vx_matrix_load (0, a_addr_base);
+		vx_matrix_load (1, b_addr_base);
 		//In case of multiple threads - sync load
 		vx_fence();
 
-		mm();   //Assuming padding to ensure matrix size is a multiple of tc_size
+		vx_matrix_mul();   //Assuming padding to ensure matrix size is a multiple of tc_size
 		vx_fence();
 		if (((task_id%num_tasks_per_warp)/num_tasks_per_thread) < thread_limit_c)
-			ms(c_addr_base);
+			vx_matrix_store(c_addr_base);
 		//In case of multiple threads - sync store
 		vx_fence();
 	}	
diff --git a/tests/regression/matmul/main.cpp b/tests/regression/matmul/main.cpp
index b2238bf5a..9b3465c52 100644
--- a/tests/regression/matmul/main.cpp
+++ b/tests/regression/matmul/main.cpp
@@ -21,8 +21,6 @@
 
 const char* kernel_file = "kernel.vxbin";
 uint32_t matrix_size = 0;
-uint32_t tc_num = 4;
-uint32_t TC_size = 8;
 
 vx_device_h device = nullptr;
 vx_buffer_h A_buffer = nullptr;
@@ -41,7 +39,7 @@ static void show_usage() {
 
 static void parse_args(int argc, char **argv, uint32_t &data_size) {
   int c;
-  while ((c = getopt(argc, argv, "n:k:d:t:s:h?")) != -1) {
+  while ((c = getopt(argc, argv, "n:k:d:h?")) != -1) {
     switch (c) {
     case 'n':
       matrix_size = atoi(optarg);
@@ -52,12 +50,6 @@ static void parse_args(int argc, char **argv, uint32_t &data_size) {
     case 'd':
       data_size = atoi(optarg);
       break; 
-    case 't':
-      tc_num = atoi(optarg);
-      break;
-    case 's':
-      TC_size = atoi(optarg);
-      break; 
     case 'h':
     case '?': {
       show_usage();
@@ -151,21 +143,15 @@ int main(int argc, char *argv[]) {
   RT_CHECK(vx_dev_open(&device));
 
   uint64_t num_cores, num_warps, num_threads;
-  uint32_t tc_size, TC_per_warp;
+  uint64_t tc_size, TC_per_warp;
 
   RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
   RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
   RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
   
-  std::cout << "Debug :: tc_size (optarg) = " << TC_size << std::endl;
-  std::cout << "Debug :: tc_num (optarg) = " << tc_num << std::endl;
-
   //Add assert/knob
-  tc_size = TC_size;
-  TC_per_warp = tc_num;
-
-  // RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size));
-  // RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp));
+  RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size));
+  RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp));
 
   std::cout << "Debug :: tc_size = " << tc_size << std::endl;
   std::cout << "Debug :: tc_num = " << TC_per_warp << std::endl;
diff --git a/tests/regression/matmul/matmul_regression.sh b/tests/regression/matmul/matmul_regression.sh
new file mode 100755
index 000000000..8d35fcfd3
--- /dev/null
+++ b/tests/regression/matmul/matmul_regression.sh
@@ -0,0 +1,26 @@
+#!/bin/bash 
+
+# README:
+# This script launches a sweep of TC_SIZE, TC_NUM and MATRIX SIZES
+# default values of NUM_WARPS=32, NUM_THREADS=32, NUM_CORES=4, DATA_SIZE=1
+# Edit matrix_sizes, tcsizes & tcnums variables to vary the sweep limits
+
+# Define arrays for tc_size,tc_num and matrix sizes
+matrix_sizes=(16 32 64 128 256 512)
+tcsizes=(8 16 32)
+tcnums=(4 8 16 32)
+
+cd ../../../build/
+
+# Loop through each combination of above configs
+for size in "${matrix_sizes[@]}"; do
+    for tcsize in "${tcsizes[@]}"; do
+        for tcnum in "${tcnums[@]}"; do
+            mkdir -p sim_final/mat${size}
+            log_name="sim_final/mat${size}/tcsize${tcsize}_tcnum${tcnum}_32w32t"
+            cmd="CONFIGS=\"-DTC_NUM=${tcnum} -DTC_SIZE=${tcsize}\" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args=\"-n${size} -d1\" --rebuild=1 --perf=1  > ${log_name} 2>&1"
+            echo $cmd
+            eval $cmd
+        done
+    done    
+done

From 5e63b8f35ac3b695d574fde4ad0a280ecbe1b83a Mon Sep 17 00:00:00 2001
From: Nayan Sivakumar Nair <nayan.nair98@gmail.com>
Date: Tue, 25 Jun 2024 23:27:18 -0400
Subject: [PATCH 5/7] dummy commit

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 6eeb1ccfa..a1593a67a 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@ Vortex is a full-stack open-source RISC-V GPGPU.
 ## Specifications
 
 - Support RISC-V RV32IMAF and RV64IMAFD
+
 - Microarchitecture:
     - configurable number of cores, warps, and threads.
     - configurable number of ALU, FPU, LSU, and SFU units per core.

From d1175a03c9606dce16cdc8f16772fba701fbb0af Mon Sep 17 00:00:00 2001
From: jaewon-lee-github <jaewon.lee.here@gmail.com>
Date: Wed, 2 Oct 2024 14:16:57 -0400
Subject: [PATCH 6/7] update the code accessing registers in obsoleted way

---
 sim/simx/emulator.cpp  | 2 +-
 sim/simx/execute.cpp   | 2 +-
 sim/simx/func_unit.cpp | 6 +++++-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp
index 8d57f69fa..08c51845c 100644
--- a/sim/simx/emulator.cpp
+++ b/sim/simx/emulator.cpp
@@ -74,11 +74,11 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
     , core_(core)
     , warps_(arch.num_warps(), arch)
     , barriers_(arch.num_barriers(), 0)
+    , ipdom_size_(arch.num_threads()-1)
     // [TBC] Currently, tradeoff between scratchpad size & performance has not been evaluated. Scratchpad is
     // considered to be big enough to hold input tiles for one output tile.
     // In future versions, scratchpad size should be fixed to an appropriate value.
     , scratchpad(std::vector<Word>(32 * 32 * 32768))
-    , ipdom_size_(arch.num_threads()-1)
 {
   this->clear();
 }
diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp
index a7d8a937d..e70d45cb2 100644
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
@@ -1473,7 +1473,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
         trace->fu_type = FUType::LSU;
         trace->lsu_type = LsuType::TCU_LOAD;
         
-        trace->used_iregs.set(rsrc0);
+        trace->src_regs[0] = {RegType::Integer, rsrc0};
         auto trace_data = std::make_shared<LsuTraceData>(num_threads);
         trace->data = trace_data;
         
diff --git a/sim/simx/func_unit.cpp b/sim/simx/func_unit.cpp
index 2de58639b..a182f6d8b 100644
--- a/sim/simx/func_unit.cpp
+++ b/sim/simx/func_unit.cpp
@@ -222,7 +222,10 @@ void LsuUnit::tick() {
 		input.pop();
 	}
 }
-///////// TENSOR code TBC ////////////////////////////////
+/*  TO BE FIXED:Tensor_core code
+    send_request is not used anymore. Need to be modified number of load
+*/
+/*
 int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
 	int count = 0;
 
@@ -275,6 +278,7 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
 	}
 	return count;
 }
+*/
 
 ///////////////////////////////////////////////////////////////////////////////
 

From b7531c9de1d4acbc33e2040fb6f4f100eb96d015 Mon Sep 17 00:00:00 2001
From: jaewon-lee-github <jaewon.lee.here@gmail.com>
Date: Wed, 2 Oct 2024 17:46:01 -0400
Subject: [PATCH 7/7] support 64bit

---
 tests/regression/matmul/kernel.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/regression/matmul/kernel.cpp b/tests/regression/matmul/kernel.cpp
index b0b4753c7..5fa976df4 100644
--- a/tests/regression/matmul/kernel.cpp
+++ b/tests/regression/matmul/kernel.cpp
@@ -8,9 +8,9 @@ void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
 	int32_t* src0_ptr = (int32_t*)arg->src0_addr;
 	int32_t* src1_ptr = (int32_t*)arg->src1_addr;
 	int32_t* dst_ptr  = (int32_t*)arg->dst_addr;
-	unsigned a_addr = reinterpret_cast<unsigned>(src0_ptr);
-	unsigned b_addr = reinterpret_cast<unsigned>(src1_ptr);
-	unsigned c_addr = reinterpret_cast<unsigned>(dst_ptr);
+	uint64_t a_addr = reinterpret_cast<uint64_t>(src0_ptr);
+	uint64_t b_addr = reinterpret_cast<uint64_t>(src1_ptr);
+	uint64_t c_addr = reinterpret_cast<uint64_t>(dst_ptr);
 
 	uint32_t tc_size = arg->tc_size;
 	uint32_t TC_per_warp = arg->TC_per_warp;
@@ -100,9 +100,9 @@ void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
 	//TODO :: change this for new task->thread distribution
 	if (((task_id%num_tasks_per_warp)/num_tasks_per_thread) < thread_limit)
 	{	
-		unsigned a_addr_base = a_addr + offset*arg->data_size;
-		unsigned b_addr_base = b_addr + offset*arg->data_size;
-		unsigned c_addr_base = c_addr + offset_c*arg->data_size;
+		uint64_t a_addr_base = a_addr + offset*arg->data_size;
+		uint64_t b_addr_base = b_addr + offset*arg->data_size;
+		uint64_t c_addr_base = c_addr + offset_c*arg->data_size;
 		csr_write(VX_MAT_MUL_SIZE,n_tiles);
 		csr_write(VX_TC_NUM,TC_per_warp);
 		csr_write(VX_TC_SIZE,tc_size);