diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index 24f95069b..93b1b998c 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -157,7 +157,7 @@ `ifdef QUARTUS `define MAX_FANOUT 8 -`define MAX_LUTRAM 1024 +`define FORCE_BRAM(d,w) ((d) >= 16 || (w) >= 128 || ((d) * (w)) >= 256) `define USE_BLOCK_BRAM (* ramstyle = "block" *) `define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *) `define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *) @@ -168,7 +168,7 @@ `define STRING string `elsif VIVADO `define MAX_FANOUT 8 -`define MAX_LUTRAM 1024 +`define FORCE_BRAM(d,w) ((d) >= 16 || (w) >= 128 || ((d) * (w)) >= 256) `define USE_BLOCK_BRAM (* ram_style = "block" *) `define USE_FAST_BRAM (* ram_style = "distributed" *) `define NO_RW_RAM_CHECK (* rw_addr_collision = "no" *) @@ -179,7 +179,7 @@ `define STRING `else `define MAX_FANOUT 8 -`define MAX_LUTRAM 1024 +`define FORCE_BRAM(d,w) ((d) >= 16 || (w) >= 128 || ((d) * (w)) >= 256) `define USE_BLOCK_BRAM `define USE_FAST_BRAM `define NO_RW_RAM_CHECK diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index d4a3001ad..4643c8d9f 100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -106,10 +106,9 @@ module VX_cache import VX_gpu_pkg::*; #( localparam MEM_ARB_SEL_BITS = `CLOG2(`CDIV(NUM_BANKS, MEM_PORTS)); localparam MEM_ARB_SEL_WIDTH = `UP(MEM_ARB_SEL_BITS); - localparam CORE_RSP_REG_DISABLE = (NUM_BANKS != 1) || (NUM_REQS != 1); - localparam MEM_REQ_REG_DISABLE = (NUM_BANKS != 1); - - localparam REQ_XBAR_BUF = (NUM_REQS > 4) ? 2 : 0; + localparam REQ_XBAR_BUF = (NUM_REQS > 2) ? 
2 : 0; + localparam CORE_RSP_BUF_ENABLE = (NUM_BANKS != 1) || (NUM_REQS != 1); + localparam MEM_REQ_BUF_ENABLE = (NUM_BANKS != 1); `ifdef PERF_ENABLE wire [NUM_BANKS-1:0] perf_read_miss_per_bank; @@ -133,7 +132,7 @@ module VX_cache import VX_gpu_pkg::*; #( .NUM_BANKS (NUM_BANKS), .UUID_WIDTH(UUID_WIDTH), .TAG_WIDTH (TAG_WIDTH), - .BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // bank xbar latency + .BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // request xbar latency ) flush_unit ( .clk (clk), .reset (reset), @@ -387,8 +386,8 @@ module VX_cache import VX_gpu_pkg::*; #( .UUID_WIDTH (UUID_WIDTH), .TAG_WIDTH (TAG_WIDTH), .FLAGS_WIDTH (FLAGS_WIDTH), - .CORE_OUT_REG (CORE_RSP_REG_DISABLE ? 0 : 1), - .MEM_OUT_REG (MEM_REQ_REG_DISABLE ? 0 : 1) + .CORE_OUT_REG (CORE_RSP_BUF_ENABLE ? 0 : `TO_OUT_BUF_REG(CORE_OUT_BUF)), + .MEM_OUT_REG (MEM_REQ_BUF_ENABLE ? 0 : `TO_OUT_BUF_REG(MEM_OUT_BUF)) ) bank ( .clk (clk), .reset (reset), @@ -481,7 +480,7 @@ module VX_cache import VX_gpu_pkg::*; #( for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_buf VX_elastic_buffer #( .DATAW (`CS_WORD_WIDTH + TAG_WIDTH), - .SIZE (CORE_RSP_REG_DISABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0), + .SIZE (CORE_RSP_BUF_ENABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0), .OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF)) ) core_rsp_buf ( .clk (clk), @@ -578,7 +577,7 @@ module VX_cache import VX_gpu_pkg::*; #( VX_elastic_buffer #( .DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)), - .SIZE (MEM_REQ_REG_DISABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0), + .SIZE (MEM_REQ_BUF_ENABLE ? 
`TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0), .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF)) ) mem_req_buf ( .clk (clk), diff --git a/hw/rtl/cache/VX_cache_top.sv b/hw/rtl/cache/VX_cache_top.sv index f992c3613..ae8cc3fc6 100644 --- a/hw/rtl/cache/VX_cache_top.sv +++ b/hw/rtl/cache/VX_cache_top.sv @@ -153,7 +153,7 @@ module VX_cache_top import VX_gpu_pkg::*; #( assign mem_rsp_ready[i] = mem_bus_if[i].rsp_ready; end - VX_cache #( + VX_cache_wrap #( .INSTANCE_ID (INSTANCE_ID), .CACHE_SIZE (CACHE_SIZE), .LINE_SIZE (LINE_SIZE), diff --git a/hw/syn/altera/dut/unittest/Makefile b/hw/syn/altera/dut/unittest/Makefile index c4479f154..3539c23b6 100644 --- a/hw/syn/altera/dut/unittest/Makefile +++ b/hw/syn/altera/dut/unittest/Makefile @@ -8,4 +8,5 @@ FPU_INCLUDE = -I$(RTL_DIR)/fpu ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src endif -RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache -I$(IP_CACHE_DIR) $(FPU_INCLUDE) \ No newline at end of file +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache -I$(IP_CACHE_DIR) $(FPU_INCLUDE) +RTL_INCLUDE += -I.. 
\ No newline at end of file diff --git a/hw/syn/altera/opae/Makefile b/hw/syn/altera/opae/Makefile index 61935f2e4..ff713e329 100644 --- a/hw/syn/altera/opae/Makefile +++ b/hw/syn/altera/opae/Makefile @@ -5,7 +5,6 @@ DEVICE_FAMILY ?= arria10 PREFIX ?= build$(XLEN) TARGET ?= fpga -NUM_CORES ?= 1 SRC_DIR := $(VORTEX_HOME)/hw/syn/altera/opae @@ -44,6 +43,7 @@ ifeq ($(DEVICE_FAMILY), arria10) CONFIGS += -DALTERA_A10 endif +ifdef NUM_CORES # cluster configuration CONFIGS_1c := -DNUM_CLUSTERS=1 -DNUM_CORES=1 CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2 @@ -53,6 +53,7 @@ CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16 CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 CONFIGS += $(CONFIGS_$(NUM_CORES)c) +endif # include sources RTL_PKGS = $(AFU_DIR)/local_mem_cfg_pkg.sv $(AFU_DIR)/ccip/ccip_if_pkg.sv diff --git a/hw/syn/xilinx/README b/hw/syn/xilinx/README index a1ca231fe..7fdca65d7 100644 --- a/hw/syn/xilinx/README +++ b/hw/syn/xilinx/README @@ -47,14 +47,18 @@ TARGET=hw PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 make chipscope # analyze build report vitis_analyzer build_xilinx_u50_gen3x16_xdma_5_202210_1_hw_4c/bin/vortex_afu.xclbin.link_summary -# resuming build for routing +# resuming builds +TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.synth" make > build.log 2>&1 & +TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl" make > build.log 2>&1 & +TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl.opt_design" make > build.log 2>&1 & +TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl.place_design" make > build.log 2>&1 & +TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl.phys_opt_design" make > build.log 2>&1 & TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl.route_design" make > build.log 2>&1 & # running test 
FPGA_BIN_DIR= TARGET=hw_emu ./ci/blackbox.sh --driver=xrt --app=demo -FPGA_BIN_DIR= TARGET=hw ./ci/blackbox.sh --driver=xrt --app=demo -FPGA_BIN_DIR= TARGET=hw_emu ./ci/blackbox.sh --driver=xrt --app=demo -FPGA_BIN_DIR= XRT_DEVICE_INDEX=1 TARGET=hw ./ci/blackbox.sh --driver=xrt --app=demo +FPGA_BIN_DIR= TARGET=hw ./ci/blackbox.sh --driver=xrt --app=sgemm --args="-n1024" +FPGA_BIN_DIR= XRT_DEVICE_INDEX=1 TARGET=hw ./ci/blackbox.sh --driver=xrt --app=sgemm --args="-n1024" # build report logs /bin/vortex_afu.xclbin.info diff --git a/hw/syn/xilinx/dut/common.mk b/hw/syn/xilinx/dut/common.mk index 81946c88f..a3c3b9bc3 100644 --- a/hw/syn/xilinx/dut/common.mk +++ b/hw/syn/xilinx/dut/common.mk @@ -37,10 +37,15 @@ else endif clean: +ifndef RESUME rm -rf project_1 rm -rf .Xil rm -f *.rpt - rm -f vivado*.log - rm -f vivado*.jou + rm -f *.log + rm -f *.jou + rm -f *.dcp +else + @echo "RESUME is defined, skipping clean." +endif .PHONY: all gen-sources build clean \ No newline at end of file diff --git a/hw/syn/xilinx/dut/project.tcl b/hw/syn/xilinx/dut/project.tcl index 9cb173c22..fb84b586a 100644 --- a/hw/syn/xilinx/dut/project.tcl +++ b/hw/syn/xilinx/dut/project.tcl @@ -11,9 +11,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# Start time -set start_time [clock seconds] - if { $::argc != 4 } { puts "ERROR: Program \"$::argv0\" requires 4 arguments!\n" puts "Usage: $::argv0 \n" @@ -46,95 +43,135 @@ if {[info exists ::env(MAX_JOBS)]} { set num_jobs 0 } -# create fpu ip -if {[info exists ::env(FPU_IP)]} { - set ip_dir $::env(FPU_IP) - set argv [list $ip_dir $device_part] - set argc 2 - source ${script_dir}/xilinx_ip_gen.tcl +proc run_setup {} { + global project_name + global top_module device_part vcs_file xdc_file + global script_dir source_dir + global num_jobs + global argv argc ;# Using global system variables: argv and argc + + # create fpu ip + if {[info exists ::env(FPU_IP)]} { + set ip_dir $::env(FPU_IP) + set argv [list $ip_dir $device_part] + set argc 2 + source ${script_dir}/xilinx_ip_gen.tcl + } + + source "${script_dir}/parse_vcs_list.tcl" + set vlist [parse_vcs_list "${vcs_file}"] + + set vsources_list [lindex $vlist 0] + set vincludes_list [lindex $vlist 1] + set vdefines_list [lindex $vlist 2] + + #puts $vsources_list + #puts $vincludes_list + #puts $vdefines_list + # Create project + create_project $project_name $project_name -force -part $device_part + + # Add constrains file + read_xdc $xdc_file + + # Add the design sources + add_files -norecurse -verbose $vsources_list + + # process defines + set_property verilog_define ${vdefines_list} [current_fileset] + + # add fpu ip + if {[info exists ::env(FPU_IP)]} { + set ip_dir $::env(FPU_IP) + add_files -norecurse -verbose ${ip_dir}/xil_fma/xil_fma.xci + add_files -norecurse -verbose ${ip_dir}/xil_fdiv/xil_fdiv.xci + add_files -norecurse -verbose ${ip_dir}/xil_fsqrt/xil_fsqrt.xci + } + + # Synthesis + set_property top $top_module [current_fileset] + set_property \ + -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} \ + -value {-mode out_of_context -flatten_hierarchy "rebuilt"} \ + -objects [get_runs synth_1] + + # register compilation hooks + #set_property STEPS.SYNTH_DESIGN.TCL.PRE ${source_dir}/pre_synth_hook.tcl [get_runs 
synth_1] + #set_property STEPS.SYNTH_DESIGN.TCL.POST ${source_dir}/post_synth_hook.tcl [get_runs synth_1] + set_property STEPS.OPT_DESIGN.TCL.PRE ${script_dir}/xilinx_async_bram_patch.tcl [get_runs impl_1] + #set_property STEPS.OPT_DESIGN.TCL.POST ${source_dir}/post_opt_hook.tcl [get_runs impl_1] + #set_property STEPS.ROUTE_DESIGN.TCL.PRE ${source_dir}/pre_route_hook.tcl [get_runs impl_1] + #set_property STEPS.ROUTE_DESIGN.TCL.POST ${source_dir}/post_route_hook.tcl [get_runs impl_1] + + update_compile_order -fileset sources_1 } -source "${script_dir}/parse_vcs_list.tcl" -set vlist [parse_vcs_list "${vcs_file}"] - -set vsources_list [lindex $vlist 0] -set vincludes_list [lindex $vlist 1] -set vdefines_list [lindex $vlist 2] - -#puts $vsources_list -#puts $vincludes_list -#puts $vdefines_list - -# Create project -create_project $project_name $project_name -force -part $device_part +proc run_synthesis {} { + global num_jobs + + if {$num_jobs != 0} { + launch_runs synth_1 -verbose -jobs $num_jobs + } else { + launch_runs synth_1 -verbose + } + wait_on_run synth_1 + open_run synth_1 + report_utilization -file post_synth_util.rpt -hierarchical -hierarchical_percentages + write_checkpoint -force post_synth.dcp +} -# Add constrains file -read_xdc $xdc_file +proc run_implementation {} { + global num_jobs + + if {$num_jobs != 0} { + launch_runs impl_1 -verbose -jobs $num_jobs + } else { + launch_runs impl_1 -verbose + } + wait_on_run impl_1 + open_run impl_1 + report_utilization -file post_impl_util.rpt -hierarchical -hierarchical_percentages + write_checkpoint -force post_impl.dcp +} -# Add the design sources -add_files -norecurse -verbose $vsources_list +proc run_report {} { + # Generate the synthesis report + report_place_status -file place.rpt + report_route_status -file route.rpt + report_timing_summary -file timing.rpt -# process defines -set_property verilog_define ${vdefines_list} [current_fileset] + # Generate timing report + report_timing -nworst 100 -delay_type 
max -sort_by group -file timing.rpt -append -# add fpu ip -if {[info exists ::env(FPU_IP)]} { - set ip_dir $::env(FPU_IP) - add_files -norecurse -verbose ${ip_dir}/xil_fma/xil_fma.xci - add_files -norecurse -verbose ${ip_dir}/xil_fdiv/xil_fdiv.xci - add_files -norecurse -verbose ${ip_dir}/xil_fsqrt/xil_fsqrt.xci + # Generate power and drc reports + report_power -file power.rpt + report_drc -file drc.rpt } -update_compile_order -fileset sources_1 - -# Synthesis -set_property top $top_module [current_fileset] +############################################################################### -set_property \ - -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} \ - -value {-mode out_of_context -flatten_hierarchy "rebuilt"} \ - -objects [get_runs synth_1] - -# register compilation hooks -#set_property STEPS.SYNTH_DESIGN.TCL.PRE ${source_dir}/pre_synth_hook.tcl [get_runs synth_1] -#set_property STEPS.SYNTH_DESIGN.TCL.POST ${source_dir}/post_synth_hook.tcl [get_runs synth_1] -set_property STEPS.OPT_DESIGN.TCL.PRE ${script_dir}/xilinx_async_bram_patch.tcl [get_runs impl_1] -#set_property STEPS.OPT_DESIGN.TCL.POST ${source_dir}/post_opt_hook.tcl [get_runs impl_1] -#set_property STEPS.ROUTE_DESIGN.TCL.PRE ${source_dir}/pre_route_hook.tcl [get_runs impl_1] -#set_property STEPS.ROUTE_DESIGN.TCL.POST ${source_dir}/post_route_hook.tcl [get_runs impl_1] +# Start time +set start_time [clock seconds] -if {$num_jobs != 0} { - launch_runs synth_1 -verbose -jobs $num_jobs -} else { - launch_runs synth_1 -verbose -} -wait_on_run synth_1 -open_run synth_1 -write_checkpoint -force post_synth.dcp -report_utilization -file post_synth_util.rpt -hierarchical -hierarchical_percentages - -# Implementation -if {$num_jobs != 0} { - launch_runs impl_1 -verbose -jobs $num_jobs +set checkpoint_synth "post_synth.dcp" +set checkpoint_impl "post_impl.dcp" + +if { [file exists $checkpoint_impl] } { + puts "Resuming from post-implementation checkpoint: $checkpoint_impl" + open_checkpoint $checkpoint_impl + run_report +} 
elseif { [file exists $checkpoint_synth] } { + puts "Resuming from post-synthesis checkpoint: $checkpoint_synth" + open_checkpoint $checkpoint_synth + run_implementation + run_report } else { - launch_runs impl_1 -verbose + # Execute full pipeline + run_setup + run_synthesis + run_implementation + run_report } -wait_on_run impl_1 -open_run impl_1 -write_checkpoint -force post_impl.dcp -report_utilization -file post_impl_util.rpt -hierarchical -hierarchical_percentages - -# Generate the synthesis report -report_place_status -file place.rpt -report_route_status -file route.rpt -report_timing_summary -file timing.rpt - -# Generate timing report -report_timing -nworst 10 -delay_type max -sort_by group -file timing.rpt - -# Generate power and drc reports -report_power -file power.rpt -report_drc -file drc.rpt # End time and calculation set elapsed_time [expr {[clock seconds] - $start_time}] diff --git a/hw/syn/xilinx/sandbox/project.tcl.in b/hw/syn/xilinx/sandbox/project.tcl.in index bb1bf86f2..b12d51d5b 100644 --- a/hw/syn/xilinx/sandbox/project.tcl.in +++ b/hw/syn/xilinx/sandbox/project.tcl.in @@ -458,7 +458,7 @@ if { [file exists post_impl.dcp] } { run_implementation run_report } else { - # execute full pipeline + # Execute full pipeline run_setup run_synthesis run_implementation diff --git a/hw/syn/xilinx/xrt/Makefile b/hw/syn/xilinx/xrt/Makefile index 288031e2e..936596671 100644 --- a/hw/syn/xilinx/xrt/Makefile +++ b/hw/syn/xilinx/xrt/Makefile @@ -15,7 +15,6 @@ endif TARGET ?= hw PLATFORM ?= -NUM_CORES ?= 1 PREFIX ?= build$(XLEN) MAX_JOBS ?= 8 @@ -64,6 +63,7 @@ DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU +ifdef NUM_CORES # cluster configuration CONFIGS_1c := -DNUM_CLUSTERS=1 -DNUM_CORES=1 CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2 @@ -73,6 +73,7 @@ CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16 CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 CONFIGS += 
$(CONFIGS_$(NUM_CORES)c) +endif # include sources RTL_PKGS = $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv diff --git a/hw/syn/yosys/Makefile b/hw/syn/yosys/Makefile index a09d9198d..3e4d930e4 100644 --- a/hw/syn/yosys/Makefile +++ b/hw/syn/yosys/Makefile @@ -5,7 +5,6 @@ SRC_DIR := $(VORTEX_HOME)/hw/syn/yosys TOP_LEVEL_ENTITY ?= Vortex PREFIX ?= build -NUM_CORES ?= 1 SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts RTL_DIR := $(VORTEX_HOME)/hw/rtl @@ -30,7 +29,7 @@ DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU - +ifdef NUM_CORES # cluster configuration CONFIGS_1c := -DNUM_CLUSTERS=1 -DNUM_CORES=1 CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2 @@ -40,6 +39,7 @@ CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16 -DL2_ENABLE CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -DL2_ENABLE CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 -DL2_ENABLE CONFIGS += $(CONFIGS_$(NUM_CORES)c) +endif # include paths FPU_INCLUDE = -I$(RTL_DIR)/fpu