Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tensor core to SimX #142

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Vortex is a full-stack open-source RISC-V GPGPU.
## Specifications

- Support RISC-V RV32IMAF and RV64IMAFD

- Microarchitecture:
- configurable number of cores, warps, and threads.
- configurable number of ALU, FPU, LSU, and SFU units per core.
Expand Down
3 changes: 3 additions & 0 deletions ci/regression.sh.in
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ regression()
# test temp driver mode for
./ci/blackbox.sh --driver=simx --app=vecadd --rebuild=3

# test for matmul
CONFIGS="-DTC_NUM=4 -DTC_SIZE=8" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args="-n128 -d1"

echo "regression tests done!"
}

Expand Down
18 changes: 18 additions & 0 deletions hw/rtl/VX_config.vh
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,24 @@
`define SOCKET_SIZE `MIN(4, `NUM_CORES)
`endif

// Size of Tensor Core (defaults to 8 unless TC_SIZE is already defined)
// NOTE(review): presumably the tile dimension handled by one tensor core - confirm
`ifndef TC_SIZE
`define TC_SIZE 8
`endif

// Number of TCs per Warp (defaults to 4 unless TC_NUM is already defined)
`ifndef TC_NUM
`define TC_NUM 4
`endif

// Number of TCU lanes (defaults to TC_NUM)
`ifndef NUM_TCU_LANES
`define NUM_TCU_LANES `TC_NUM
`endif

// Number of TCU blocks (defaults to ISSUE_WIDTH)
`ifndef NUM_TCU_BLOCKS
`define NUM_TCU_BLOCKS `ISSUE_WIDTH
`endif

`ifdef L2_ENABLE
`define L2_ENABLED 1
`else
Expand Down
6 changes: 6 additions & 0 deletions hw/rtl/VX_types.vh
Original file line number Diff line number Diff line change
Expand Up @@ -201,4 +201,10 @@
`define VX_CSR_NUM_CORES 12'hFC2
`define VX_CSR_LOCAL_MEM_BASE 12'hFC3

// Tensor core CSR addresses (12-bit)
`define VX_MAT_MUL_SIZE 12'hFC4 // VX_MAT_MUL_SIZE = Matrix Size / TC Size
`define VX_TC_NUM 12'hFC5 // NOTE(review): presumably the number of tensor cores (see TC_NUM) - confirm
`define VX_TC_SIZE 12'hFC6 // NOTE(review): presumably the tensor core size (see TC_SIZE) - confirm



`endif // VX_TYPES_VH
18 changes: 18 additions & 0 deletions kernel/include/vx_intrinsics.h
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be good to document the additional instructions.
You probably did that in either a design doc or your poster.
You could link that here or document it in the repository.

Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,24 @@ inline void vx_fence() {
__asm__ volatile ("fence iorw, iorw");
}

//Matrix load
// Issues the custom matrix-load instruction (opcode 0x7b, funct3 = 0).
//   dest: encoded into the instruction's 12-bit immediate field, so it must be
//         a compile-time constant at the call site ("i" constraint).
//         NOTE(review): presumably selects the destination inside the tensor
//         core scratchpad - confirm against the TCU implementation.
//   addr: base address of the matrix data, passed in rs1.
// The "memory" clobber stops the compiler from moving ordinary loads/stores
// across the instruction, so data written just before the call is visible to it.
inline void vx_matrix_load(unsigned dest, unsigned addr)
{
    __asm__ volatile (".insn i 0x7b, 0, x0, %0(%1)" :: "i"(dest), "r"(addr) : "memory");
}

//Matrix Store
// Issues the custom matrix-store instruction (opcode 0x7b, funct3 = 1).
//   addr: base address the matrix data is stored to, passed in rs1
//         (the immediate field is always 0).
// The "memory" clobber tells the compiler that memory is modified, so values
// cached in registers are reloaded after the store completes.
inline void vx_matrix_store(unsigned addr)
{
    __asm__ volatile (".insn i 0x7b, 1, x0, 0(%0)" :: "r"(addr) : "memory");
}

//Matrix Mul
// Issues the custom matrix-multiply instruction (opcode 0x7b, funct3 = 2).
// Takes no operands: rd, rs1 and the immediate are all encoded as zero.
// Uses __asm__ (like vx_fence) so the header also builds in strict ISO C mode,
// where the plain `asm` keyword is not available.
inline void vx_matrix_mul()
{
    __asm__ volatile (".insn i 0x7b, 2, x0, 0(x0)");
}

#ifdef __cplusplus
}
#endif
Expand Down
2 changes: 2 additions & 0 deletions runtime/include/vortex.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ typedef void* vx_buffer_h;
#define VX_CAPS_ISA_FLAGS 0x7
#define VX_CAPS_NUM_MEM_BANKS 0x8
#define VX_CAPS_MEM_BANK_SIZE 0x9
#define VX_CAPS_TC_SIZE 0xA
#define VX_CAPS_TC_NUM 0xB

// device isa flags
#define VX_ISA_STD_A (1ull << ISA_STD_A)
Expand Down
6 changes: 6 additions & 0 deletions runtime/simx/vortex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ class vx_device {
case VX_CAPS_NUM_CORES:
_value = NUM_CORES * NUM_CLUSTERS;
break;
case VX_CAPS_TC_SIZE:
_value = TC_SIZE;
break;
case VX_CAPS_TC_NUM:
_value = TC_NUM;
break;
case VX_CAPS_CACHE_LINE_SIZE:
_value = CACHE_BLOCK_SIZE;
break;
Expand Down
3 changes: 2 additions & 1 deletion sim/simx/arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class Arch {
uint64_t local_mem_base_;

public:
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores)
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores)
: num_threads_(num_threads)
, num_warps_(num_warps)
, num_cores_(num_cores)
Expand Down Expand Up @@ -70,6 +70,7 @@ class Arch {
uint16_t socket_size() const {
return socket_size_;
}

};

}
4 changes: 3 additions & 1 deletion sim/simx/core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,14 @@ Core::Core(const SimContext& ctx,
dispatchers_.at((int)FUType::FPU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES);
dispatchers_.at((int)FUType::LSU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_LSU_BLOCKS, NUM_LSU_LANES);
dispatchers_.at((int)FUType::SFU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_SFU_BLOCKS, NUM_SFU_LANES);

dispatchers_.at((int)FUType::TCU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_TCU_BLOCKS, NUM_TCU_LANES);

// initialize execute units
func_units_.at((int)FUType::ALU) = SimPlatform::instance().create_object<AluUnit>(this);
func_units_.at((int)FUType::FPU) = SimPlatform::instance().create_object<FpuUnit>(this);
func_units_.at((int)FUType::LSU) = SimPlatform::instance().create_object<LsuUnit>(this);
func_units_.at((int)FUType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this);
func_units_.at((int)FUType::TCU) = SimPlatform::instance().create_object<TcuUnit>(this);

// bind commit arbiters
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
Expand Down
1 change: 1 addition & 0 deletions sim/simx/core.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ class Core : public SimObject<Core> {
friend class AluUnit;
friend class FpuUnit;
friend class SfuUnit;
friend class TcuUnit;
};

} // namespace vortex
19 changes: 19 additions & 0 deletions sim/simx/decode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ static const std::unordered_map<Opcode, InstType> sc_instTable = {
{Opcode::EXT2, InstType::R4},
{Opcode::R_W, InstType::R},
{Opcode::I_W, InstType::I},
{Opcode::TCU, InstType::I},
};

enum Constants {
Expand Down Expand Up @@ -405,6 +406,16 @@ static const char* op_string(const Instr &instr) {
default:
std::abort();
}

case Opcode::TCU:
switch(func3)
{
case 0: return "ML"; // Matrix Load
case 1: return "MS"; // Matrix Store
case 2: return "MATMUL"; // Matrix Multiply
default:
std::abort();
}
default:
std::abort();
}
Expand Down Expand Up @@ -548,6 +559,14 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {

case InstType::I: {
switch (op) {
case Opcode::TCU: {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Which instruction does this correspond to?

instr->setDestReg(rs1, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->setFunc3(func3);
instr->setFunc7(func7);
auto imm = code >> shift_rs2;
instr->setImm(sext(imm, width_i_imm));
} break;
case Opcode::I:
case Opcode::I_W:
case Opcode::JALR:
Expand Down
40 changes: 39 additions & 1 deletion sim/simx/emulator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
, warps_(arch.num_warps(), arch)
, barriers_(arch.num_barriers(), 0)
, ipdom_size_(arch.num_threads()-1)
// [TBC] The trade-off between scratchpad size and performance has not been evaluated yet. For now, the
// scratchpad is assumed to be big enough to hold the input tiles for one output tile.
// In future versions, the scratchpad size should be fixed to an appropriate value.
, scratchpad(std::vector<Word>(32 * 32 * 32768))
{
this->clear();
}
Expand Down Expand Up @@ -111,6 +115,11 @@ void Emulator::clear() {
active_warps_.set(0);
warps_[0].tmask.set(0);
wspawn_.valid = false;

for (auto& reg : scratchpad)
{
reg = 0;
}
}

void Emulator::attach_ram(RAM* ram) {
Expand Down Expand Up @@ -343,6 +352,21 @@ void Emulator::cout_flush() {
case (addr + (VX_CSR_MPM_BASE_H-VX_CSR_MPM_BASE)) : return ((value >> 32) & 0xFFFFFFFF)
#endif

// Returns the current mat_size value, i.e. the value last written to the
// VX_MAT_MUL_SIZE CSR through set_csr().
Word Emulator::get_tiles()
{
    return this->mat_size;
}

// Returns the current tc_size value, i.e. the value last written to the
// VX_TC_SIZE CSR through set_csr().
Word Emulator::get_tc_size()
{
    return this->tc_size;
}

// Returns the current tc_num value, i.e. the value last written to the
// VX_TC_NUM CSR through set_csr().
Word Emulator::get_tc_num()
{
    return this->tc_num;
}

Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
auto core_perf = core_->perf_stats();
switch (addr) {
Expand Down Expand Up @@ -374,6 +398,10 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_NUM_CORES: return uint32_t(arch_.num_cores()) * arch_.num_clusters();
case VX_CSR_LOCAL_MEM_BASE: return arch_.local_mem_base();
case VX_CSR_MSCRATCH: return csr_mscratch_;
case VX_MAT_MUL_SIZE: return mat_size;
case VX_TC_NUM: return tc_num;
case VX_TC_SIZE: return tc_size;

CSR_READ_64(VX_CSR_MCYCLE, core_perf.cycles);
CSR_READ_64(VX_CSR_MINSTRET, core_perf.instrs);
default:
Expand Down Expand Up @@ -486,6 +514,16 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
case VX_CSR_MNSTATUS:
case VX_CSR_MCAUSE:
break;
case VX_MAT_MUL_SIZE:
mat_size = value;
break;
case VX_TC_NUM:
tc_num = value;
break;
case VX_TC_SIZE:
tc_size = value;
break;

default: {
std::cout << "Error: invalid CSR write addr=0x" << std::hex << addr << ", value=0x" << value << std::dec << std::endl;
std::abort();
Expand All @@ -502,4 +540,4 @@ void Emulator::update_fcrs(uint32_t fflags, uint32_t tid, uint32_t wid) {
this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, tid, wid) | fflags, tid, wid);
this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, tid, wid) | fflags, tid, wid);
}
}
}
10 changes: 9 additions & 1 deletion sim/simx/emulator.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,11 @@ class Emulator {
bool wspawn(uint32_t num_warps, Word nextPC);

int get_exitcode() const;


Word get_tiles();
Word get_tc_size();
Word get_tc_num();

private:

struct ipdom_entry_t {
Expand Down Expand Up @@ -127,6 +131,10 @@ class Emulator {
uint32_t ipdom_size_;
Word csr_mscratch_;
wspawn_t wspawn_;
std::vector<Word> scratchpad;
// Tensor core CSR state: written by set_csr(), read by get_csr() and the
// get_tiles()/get_tc_size()/get_tc_num() accessors. Zero-initialized so that
// a read before the first CSR write returns a defined value.
uint32_t mat_size = 0; // backs VX_MAT_MUL_SIZE
uint32_t tc_size = 0;  // backs VX_TC_SIZE
uint32_t tc_num = 0;   // backs VX_TC_NUM
};

}
Expand Down
Loading
Loading